// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;
// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer of the given
// dimensions.
// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const DSizes<IndexType, NumDims>& dimensions) {
DSizes<IndexType, NumDims> strides;
if (NumDims == 0) return strides;
// TODO(ezhulenev): Use templates to unroll this loop (similar to
// h_array_reduce in CXX11meta.h)? Benchmark it.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
strides[i] = strides[i - 1] * dimensions[i - 1];
}
} else {
strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dimensions[i + 1];
}
}
return strides;
}
template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const Eigen::array<IndexType, NumDims>& dimensions) {
return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(const Sizes<Indices...>& sizes) {
return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
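// Example (illustrative usage sketch): for a column-major 2x3x4 tensor the
// innermost dimension is contiguous, so the strides are [1, 2, 6]; for
// row-major they are [12, 4, 1].
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // [1, 2, 6]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // [12, 4, 1]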
// -------------------------------------------------------------------------- //
// Tensor block shape type defines the shape preference for blocks extracted
// from a larger tensor.
//
// Example: blocks of 100 elements from the large 100x100 tensor:
// - tensor: 100x100
// - target_block_size: 100
//
// TensorBlockShapeType:
// - kUniformAllDims: 100 blocks of size 10x10
// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100, depending on
//   column- or row-major layout)
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
struct TensorBlockResourceRequirements {
TensorBlockShapeType shape_type; // target block shape
size_t size; // target block size
TensorOpCost cost_per_coeff; // cost of computing a single block element
#ifdef EIGEN_HIPCC
// For HIPCC, we need to explicitly declare the constructor, which is
// implicitly invoked in the "merge" / "any" routines, as a "device" function;
// otherwise HIPCC errors out complaining about the lack of a matching
// constructor.
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_)
: shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
size_t size_in_bytes, TensorOpCost cost) {
const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
return {shape_type, size, cost};
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
size_t size_in_bytes) {
// This default cost per coefficient is valid for most materialized tensor
// block evaluation implementations, because they typically just read
// coefficients from the underlying tensor storage and write to the tensor
// block buffer (scratch or destination memory; reads and writes have a linear
// access pattern). We ignore the fixed cost of block evaluation, because in
// practice it should be negligible.
//
// Lazy block evaluation adds the cost of calling a functor for each
// coefficient.
//
// All non-trivial block evaluation implementations must provide their own
// cost approximation (e.g. shuffling the inner dimension has a much higher
// cost because it reads memory randomly, although the total number of moved
// bytes is the same).
return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
{/*bytes_loaded=*/sizeof(Scalar),
/*bytes_stored=*/sizeof(Scalar),
/*compute_cycles=*/0});
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {
return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {
return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, size_in_bytes);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) {
return {merge(lhs.shape_type, rhs.shape_type), // shape_type
merge(lhs.size, rhs.size), // size
merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
}
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {
cost_per_coeff += cost;
return *this;
}
// This is a resource requirement that should be returned from expressions
// that do not have any block evaluation preference (e.g. default tensor
// expression with raw buffer access).
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
}
private:
using Requirements = TensorBlockResourceRequirements;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
return numext::maxi(lhs_size, rhs_size);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
TensorBlockShapeType rhs) {
return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims)
? TensorBlockShapeType::kSkewedInnerDims
: TensorBlockShapeType::kUniformAllDims;
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) {
return lhs_cost + rhs_cost;
}
};
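// Example (illustrative usage sketch): an evaluator that prefers skewed blocks
// of roughly 48 KB of floats, merged with a child expression that has no
// preference, keeps the skewed shape and the larger size.
//
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(48 * 1024);
//   TensorBlockResourceRequirements rhs = TensorBlockResourceRequirements::any();
//   TensorBlockResourceRequirements req =
//       TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == TensorBlockShapeType::kSkewedInnerDims
//   // req.size == 48 * 1024 / sizeof(float)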
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
public:
typedef DSizes<IndexType, NumDims> Dimensions;
// If we evaluate a Tensor assignment, and the expression on the left already
// has a memory buffer, then we can optimize performance by evaluating the
// root expression directly into the final output memory. Sometimes it's also
// possible to reuse it for materializing subexpressions inside an expression
// tree, to avoid dynamic memory allocation.
//
// The pointer type of the underlying storage is erased, because passing the
// Scalar type through all the expression evaluation layers would require far
// too many templates. In practice the destination buffer type should always
// match the evaluated expression's scalar type.
class DestinationBuffer {
public:
enum DestinationBufferKind : int {
// The above explicit specification of "int" as the enum basetype is
// needed to get around a HIPCC link error ("the field type is not
// amp-compatible")
// which is issued for class members with the enum type.
// TODO(rocm):
// remove the "int" basetype once HIPCC has been fixed to not error out
// in the above scenario.
// Destination buffer is not defined (`m_data` == nullptr).
kEmpty,
// The tensor block defined by the owning tensor block descriptor fits
// contiguously into the destination buffer. In this case it's safe to
// materialize the tensor block in the destination buffer, wrap it in a
// TensorMap, and use it to build an Eigen expression on top of it.
kContiguous,
// Destination buffer strides do not match the strides of the contiguously
// stored block, and it's impossible to define a TensorMap over this buffer.
// However, if we are evaluating the root of an expression tree, we can still
// materialize an output into this destination, because we can guarantee that
// no one will ever access it through the block API.
//
// In theory it is possible to build a valid TensorStriding<TensorMap>
// expression on top of this destination buffer, however it has inefficient
// coeff/packet access, and defeats the purpose of the fast block evaluation
// API.
kStrided
};
template <typename Scalar>
Scalar* data() const {
eigen_assert(m_data_type_size == sizeof(Scalar));
return static_cast<Scalar*>(m_data);
}
const Dimensions& strides() const { return m_strides; }
const DestinationBufferKind& kind() const { return m_kind; }
private:
friend class TensorBlockDescriptor<NumDims, IndexType>;
DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
template <typename Scalar>
DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind)
: m_data(static_cast<void*>(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {}
template <int Layout, typename Scalar>
static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) {
return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
}
template <int Layout>
static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) {
const Dimensions& desc_dims = desc.dimensions();
const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
for (int i = 0; i < NumDims; ++i) {
if (desc_dims[i] == 1) continue;
if (desc_strides[i] != strides[i]) return kStrided;
}
return kContiguous;
}
// The storage pointer is type-erased to reduce template bloat, but we still
// keep the size of the underlying element type for error checking.
void* m_data;
size_t m_data_type_size;
// Destination buffer dimensions always match the dimensions of the tensor
// block descriptor it belongs to; however, the strides might be different.
Dimensions m_strides;
DestinationBufferKind m_kind;
};
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination)
: m_offset(offset), m_dimensions(dimensions), m_destination(destination) {}
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
: m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {}
IndexType offset() const { return m_offset; }
const Dimensions& dimensions() const { return m_dimensions; }
IndexType dimension(int index) const { return m_dimensions[index]; }
IndexType size() const { return array_prod<IndexType>(m_dimensions); }
const DestinationBuffer& destination() const { return m_destination; }
template <int Layout, typename Scalar>
void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
eigen_assert(dst_base != NULL);
m_destination = DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
}
template <int Layout, typename Scalar, typename DstStridesIndexType>
void AddDestinationBuffer(Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
// DSizes constructor will do index type promotion if it's safe.
AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
}
TensorBlockDescriptor& DropDestinationBuffer() {
m_destination.m_data = NULL;
m_destination.m_kind = DestinationBuffer::kEmpty;
return *this;
}
bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; }
// Returns a copy of `*this` with updated offset.
TensorBlockDescriptor WithOffset(IndexType offset) const {
return TensorBlockDescriptor(offset, m_dimensions, m_destination);
}
private:
// Offset and dimensions are immutable after construction. Block descriptor
// can only be mutated by adding or dropping destination.
const IndexType m_offset;
const Dimensions m_dimensions;
DestinationBuffer m_destination;
};
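// Example (illustrative usage sketch; `out` is a hypothetical float* into the
// final output memory): a 10x10 block at linear offset 100 of a larger
// column-major tensor, with a destination buffer whose strides do not match
// the contiguous block strides {1, 10}.
//
//   typedef TensorBlockDescriptor<2> Desc;
//   Desc desc(/*offset=*/100, Desc::Dimensions(10, 10));
//   desc.AddDestinationBuffer<ColMajor>(out, Desc::Dimensions(1, 100));
//   // desc.destination().kind() == Desc::DestinationBuffer::kStrided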
// -------------------------------------------------------------------------- //
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
public:
typedef DSizes<IndexType, NumDims> Dimensions;
TensorBlockMapper() = default;
TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, const TensorBlockResourceRequirements& requirements)
: m_tensor_dimensions(dimensions), m_requirements(requirements) {
// Compute block dimensions and the total number of blocks.
InitializeBlockDimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& blockDimensions() const {
return m_block_dimensions;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const {
static const bool isColMajor = Layout == static_cast<int>(ColMajor);
IndexType offset = 0;
DSizes<IndexType, NumDims> dimensions;
if (NumDims == 0) return BlockDescriptor(offset, dimensions);
// Iterate outer -> inner dimensions.
for (int i = NumDims - 1; i >= 0; --i) {
const int dim = isColMajor ? i : NumDims - i - 1;
const IndexType idx = block_index / m_block_strides[dim];
block_index -= idx * m_block_strides[dim];
const IndexType coord = idx * m_block_dimensions[dim];
dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]);
offset += coord * m_tensor_strides[dim];
}
return {offset, dimensions};
}
private:
void InitializeBlockDimensions() {
// Requested block shape and size.
const TensorBlockShapeType shape_type = m_requirements.shape_type;
IndexType target_block_size = numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
IndexType tensor_size = m_tensor_dimensions.TotalSize();
// Corner case: one of the dimensions is zero. The logic below is too complex
// to handle this case in general, so just use a unit block size.
// Note: we must not yield blocks with zero dimensions (a recipe for
// overflows/underflows, divisions by zero and NaNs later).
if (tensor_size == 0) {
for (int i = 0; i < NumDims; ++i) {
m_block_dimensions[i] = 1;
}
m_total_block_count = 0;
return;
}
// If tensor fits into a target block size, evaluate it as a single block.
if (tensor_size <= target_block_size) {
m_block_dimensions = m_tensor_dimensions;
m_total_block_count = 1;
// The only valid block index is `0`, and in this case we do not need
// to compute real strides for tensor or blocks (see blockDescriptor).
for (int i = 0; i < NumDims; ++i) {
m_tensor_strides[i] = 0;
m_block_strides[i] = 1;
}
return;
}
static const bool isColMajor = Layout == static_cast<int>(ColMajor);
// Block shape skewed towards inner dimension.
if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
IndexType coeff_to_allocate = target_block_size;
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - i - 1;
m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
coeff_to_allocate =
numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
}
eigen_assert(coeff_to_allocate == 1);
} else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
// The tensor does not fit within the 'target_block_size' budget: calculate
// tensor block dimension sizes based on a "square" dimension size target.
const IndexType dim_size_target = convert_index<IndexType>(
std::pow(static_cast<float>(target_block_size), 1.0f / static_cast<float>(m_block_dimensions.rank())));
for (int i = 0; i < NumDims; ++i) {
// TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
// a multiple of the packet size. Note that reducing
// 'block_dim_size' in this manner can increase the number of
// blocks, and so will amplify any per-block overhead.
m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]);
}
// Add any un-allocated coefficients to inner dimension(s).
IndexType total_size = m_block_dimensions.TotalSize();
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - i - 1;
if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
const IndexType total_size_other_dims = total_size / m_block_dimensions[dim];
const IndexType alloc_avail = numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
if (alloc_avail == m_block_dimensions[dim]) {
// Insufficient excess coefficients to allocate.
break;
}
m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail);
total_size = total_size_other_dims * m_block_dimensions[dim];
}
}
} else {
eigen_assert(false); // unknown block shape
}
eigen_assert(m_block_dimensions.TotalSize() >=
numext::mini<IndexType>(target_block_size, m_tensor_dimensions.TotalSize()));
// Calculate block counts by dimension and total block count.
DSizes<IndexType, NumDims> block_count;
for (int i = 0; i < NumDims; ++i) {
block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
}
m_total_block_count = array_prod(block_count);
// Calculate block strides (used for enumerating blocks).
m_tensor_strides = strides<Layout>(m_tensor_dimensions);
m_block_strides = strides<Layout>(block_count);
}
DSizes<IndexType, NumDims> m_tensor_dimensions;
TensorBlockResourceRequirements m_requirements;
DSizes<IndexType, NumDims> m_block_dimensions;
IndexType m_total_block_count;
DSizes<IndexType, NumDims> m_tensor_strides;
DSizes<IndexType, NumDims> m_block_strides;
};
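// Example (illustrative usage sketch): enumerating all blocks of a 100x100
// tensor with a budget of 100 coefficients per block and a uniform shape.
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::uniform<float>(100 * sizeof(float)));
//   // mapper.blockDimensions() == [10, 10], mapper.blockCount() == 100
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() describe block `i`.
//   }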
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// the Eigen expression traversal order is deterministic, all temporary
// allocations happen in the same order, and usually have exactly the same
// size. The scratch allocator keeps track of all dynamic allocations, so after
// the first block evaluation is completed, we can reuse all the temporary
// buffers for the next block evaluation.
template <typename Device>
class TensorBlockScratchAllocator {
public:
explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}
~TensorBlockScratchAllocator() {
for (size_t i = 0; i < m_allocations.size(); ++i) {
m_device.deallocate(m_allocations[i].ptr);
}
}
void* allocate(size_t size) {
// TODO(ezhulenev): Remove when replaced with inlined vector.
if (m_allocations.capacity() == 0) m_allocations.reserve(8);
// Check if we already have an existing allocation at the current index.
const int num_allocations = static_cast<int>(m_allocations.size());
const bool has_allocation = m_allocation_index < num_allocations;
// Allocation index can't be larger than the number of allocations.
eigen_assert(m_allocation_index <= num_allocations);
// If we have an existing allocation, and its size is larger than or equal to
// the requested size, we do nothing.
// If the current allocation can't fit the requested size, we deallocate it,
// and replace it with a larger allocation.
if (has_allocation && m_allocations[m_allocation_index].size < size) {
m_device.deallocate(m_allocations[m_allocation_index].ptr);
m_allocations[m_allocation_index].ptr = m_device.allocate(size);
m_allocations[m_allocation_index].size = size;
}
// Make a new allocation if we don't have an existing one.
if (!has_allocation) {
Allocation allocation;
allocation.ptr = m_device.allocate(size);
allocation.size = size;
m_allocations.push_back(allocation);
}
eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
eigen_assert(m_allocations[m_allocation_index].size >= size);
return m_allocations[m_allocation_index++].ptr;
}
void reset() { m_allocation_index = 0; }
private:
struct Allocation {
void* ptr;
size_t size;
};
const Device& m_device;
int m_allocation_index;
// TODO(ezhulenev): This should be an inlined vector.
std::vector<Allocation> m_allocations;
};
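// Example (illustrative usage sketch; `device` is assumed to be a
// DefaultDevice instance): allocations requested in the same order for every
// block are served from reused buffers after `reset()`.
//
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* a = scratch.allocate(1024);  // new 1024-byte allocation
//   scratch.reset();
//   void* b = scratch.allocate(512);   // reuses the same buffer (a == b)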
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
enum TensorBlockKind {
// Tensor block that is a lazy expression that must be assigned to a
// destination using TensorBlockAssign.
kExpr,
// Tensor block that is a view into a memory buffer owned by an underlying
// Tensor expression (e.g. it can be a view into a Tensor buffer).
kView,
// Tensor block that was materialized in a scratch memory buffer, allocated
// with TensorBlockScratchAllocator. This block must be copied to a
// destination, similar to a block of `kExpr` type.
kMaterializedInScratch,
// Tensor block that was materialized directly into the final output memory
// buffer. For example if the left side of an assignment is a Tensor, we can
// directly materialize the block in the destination memory.
//
// If strides in the output buffer do not match tensor block strides, the
// Tensor expression will be invalid, and should not be used by
// TensorBlockAssign or for constructing another block expression.
kMaterializedInOutput
};
// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
public:
typedef void XprType;
};
// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from Eigen expressions (if the expression
// type is not void). It's required to be able to define a lazy block expression
// for argument types that do not support block evaluation.
template <typename XprType>
struct XprScalar {
typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer: it's either a memory
// buffer that backs the original expression (e.g. the block is just a view into
// a Tensor), or a memory buffer allocated with the scratch allocator, in which
// case the scratch allocator will deallocate it at the end of block-based
// expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.
template <typename Scalar, int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
public:
typedef DSizes<IndexType, NumDims> Dimensions;
typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions,
bool valid_expr = true)
: m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) {
eigen_assert(m_kind == internal::TensorBlockKind::kView ||
m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
m_kind == internal::TensorBlockKind::kMaterializedInOutput);
}
TensorBlockKind kind() const { return m_kind; }
// NOTE(ezhulenev): Returning XprType by value like in other block types
// causes asan failures. The theory is that XprType::Nested doesn't work
// properly for TensorMap.
const XprType& expr() const {
eigen_assert(m_valid_expr);
return m_expr;
}
const Scalar* data() const { return m_data; }
void cleanup() {}
typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
// TensorMaterializedBlock can be backed by different types of storage:
//
// (1) Contiguous block of memory allocated with scratch allocator.
// (2) Contiguous block of memory reused from tensor block descriptor
// destination buffer.
// (3) Strided block of memory reused from tensor block descriptor
// destination buffer.
//
class Storage {
public:
Scalar* data() const { return m_data; }
const Dimensions& dimensions() const { return m_dimensions; }
const Dimensions& strides() const { return m_strides; }
TensorMaterializedBlock AsTensorMaterializedBlock() const {
return TensorMaterializedBlock(m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput
: internal::TensorBlockKind::kMaterializedInScratch,
m_data, m_dimensions, !m_strided_storage);
}
private:
friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;
Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output,
bool strided_storage)
: m_data(data),
m_dimensions(dimensions),
m_strides(strides),
m_materialized_in_output(materialized_in_output),
m_strided_storage(strided_storage) {}
Scalar* m_data;
Dimensions m_dimensions;
Dimensions m_strides;
bool m_materialized_in_output;
bool m_strided_storage;
};
// Creates storage for a materialized block, either reusing the block
// descriptor's destination buffer or allocating a new buffer with the scratch
// allocator.
template <typename TensorBlockScratch>
EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool allow_strided_storage = false) {
// Try to reuse destination as an output block buffer.
typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
if (desc.destination().kind() == DestinationBuffer::kContiguous) {
Scalar* buffer = desc.destination().template data<Scalar>();
desc.DropDestinationBuffer();
return Storage(buffer, desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
/*materialized_in_output=*/true,
/*strided_storage=*/false);
} else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) {
Scalar* buffer = desc.destination().template data<Scalar>();
desc.DropDestinationBuffer();
return Storage(buffer, desc.dimensions(), desc.destination().strides(),
/*materialized_in_output=*/true, /*strided_storage=*/true);
} else {
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
return Storage(static_cast<Scalar*>(mem), desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
/*materialized_in_output=*/false,
/*strided_storage=*/false);
}
}
// Creates a materialized block for the given descriptor from a memory buffer.
template <typename DataDimensions, typename TensorBlockScratch>
EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims,
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
// If the tensor block dimensions cover a contiguous block of the underlying
// memory, we can skip block buffer memory allocation, and construct a block
// from the existing `data` memory buffer.
//
// Example: (RowMajor layout)
// data_dims: [11, 12, 13, 14]
// desc.dimensions(): [1, 1, 3, 14]
//
// In this case we can construct a TensorBlock starting at
// `data + desc.offset()`, with `desc.dimensions()` block sizes.
static const bool is_col_major = Layout == ColMajor;
// Find out how many inner dimensions have a matching size.
int num_matching_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
int dim = is_col_major ? i : NumDims - i - 1;
if (data_dims[dim] != desc.dimensions()[dim]) break;
++num_matching_inner_dims;
}
// All the outer dimensions must be of size `1`, except a single dimension
// before the matching inner dimension (`3` in the example above).
bool can_use_direct_access = true;
for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
int dim = is_col_major ? i : NumDims - i - 1;
if (desc.dimension(dim) != 1) {
can_use_direct_access = false;
break;
}
}
if (can_use_direct_access) {
const Scalar* block_start = data + desc.offset();
return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());
} else {
// Reuse destination buffer or allocate new buffer with scratch allocator.
const Storage storage = prepareStorage(desc, scratch);
typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());
TensorBlockIO::Copy(dst, src);
return storage.AsTensorMaterializedBlock();
}
}
private:
TensorBlockKind m_kind;
const Scalar* m_data;
Dimensions m_dimensions;
XprType m_expr;
bool m_valid_expr;
};
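// Example (illustrative usage sketch; `data`, `data_dims`, `desc` and `scratch`
// are assumed to be an existing buffer, its dimensions, a block descriptor and
// a scratch allocator): if the requested block maps onto a contiguous range of
// `data`, the result is a `kView` block; otherwise the data is first copied
// into the destination buffer or scratch memory.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> Block;
//   Block block = Block::materialize(data, data_dims, desc, scratch);
//   const Block::XprType& expr = block.expr();  // TensorMap over block data
//   block.cleanup();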
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
// UnaryOp functor to the blocks produced by the underlying Tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;
public:
typedef std::conditional_t<NoArgBlockAccess, void,
TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
: m_arg_block(arg_block), m_functor(functor) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
const Scalar* data() const { return NULL; }
void cleanup() { m_arg_block.cleanup(); }
private:
ArgTensorBlock m_arg_block;
UnaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
// BinaryOp functor to the blocks produced by the two underlying Tensor
// expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
internal::is_void<typename RhsTensorBlock::XprType>::value;
public:
typedef std::conditional_t<
NoArgBlockAccess, void,
TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
: m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }
const Scalar* data() const { return NULL; }
void cleanup() {
m_left_block.cleanup();
m_right_block.cleanup();
}
private:
LhsTensorBlock m_left_block;
RhsTensorBlock m_right_block;
BinaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is a
// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
typedef typename ArgTensorBlock::XprType ArgXprType;
static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
public:
typedef std::conditional_t<NoArgBlockAccess, void, typename BlockFactory::template XprType<ArgXprType>::type> XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory)
: m_arg_block(arg_block), m_factory(factory) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
const Scalar* data() const { return NULL; }
void cleanup() { m_arg_block.cleanup(); }
private:
ArgTensorBlock m_arg_block;
BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.
template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
typedef typename Arg1TensorBlock::XprType Arg1XprType;
typedef typename Arg2TensorBlock::XprType Arg2XprType;
typedef typename Arg3TensorBlock::XprType Arg3XprType;
static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
internal::is_void<Arg2XprType>::value ||
internal::is_void<Arg3XprType>::value;
public:
typedef std::conditional_t<NoArgBlockAccess, void,
typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, Arg3XprType>::type>
XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block,
const Arg3TensorBlock& arg3_block, const BlockFactory& factory)
: m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); }
const Scalar* data() const { return NULL; }
void cleanup() {
m_arg1_block.cleanup();
m_arg2_block.cleanup();
m_arg3_block.cleanup();
}
private:
Arg1TensorBlock m_arg1_block;
Arg2TensorBlock m_arg2_block;
Arg3TensorBlock m_arg3_block;
BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
typedef typename packet_traits<Scalar>::type Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
enum {
Vectorizable = packet_traits<Scalar>::Vectorizable,
PacketSize = packet_traits<Scalar>::size,
HalfPacketSize = unpacket_traits<HalfPacket>::size,
HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
};
public:
// Specifying linear copy kind statically gives ~30% speedup for small sizes.
enum class Kind {
Linear = 0, // src_stride == 1 && dst_stride == 1
Scatter = 1, // src_stride == 1 && dst_stride != 1
FillLinear = 2, // src_stride == 0 && dst_stride == 1
FillScatter = 3, // src_stride == 0 && dst_stride != 1
Gather = 4, // dst_stride == 1
Random = 5 // everything else
};
struct Dst {
Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
IndexType offset;
IndexType stride;
Scalar* data;
};
struct Src {
Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {}
IndexType offset;
IndexType stride;
const Scalar* data;
};
template <typename StridedLinearBufferCopy::Kind kind>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) {
Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data);
}
private:
template <typename StridedLinearBufferCopy::Kind kind>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset,
const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
const IndexType src_offset, const IndexType src_stride,
const Scalar* EIGEN_RESTRICT src_data) {
const Scalar* src = &src_data[src_offset];
Scalar* dst = &dst_data[dst_offset];
if (!Vectorizable) {
for (Index i = 0; i < count; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
return;
}
const IndexType vectorized_size = PacketSize * (count / PacketSize);
IndexType i = 0;
if (kind == StridedLinearBufferCopy::Kind::Linear) {
// ******************************************************************** //
// Linear copy from `src` to `dst`.
const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
eigen_assert(src_stride == 1 && dst_stride == 1);
for (; i < unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
Packet p = ploadu<Packet>(src + i + j * PacketSize);
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
}
}
for (; i < vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pstoreu<Scalar, Packet>(dst + i, p);
}
if (HasHalfPacket) {
const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
if (i < vectorized_half_size) {
HalfPacket p = ploadu<HalfPacket>(src + i);
pstoreu<Scalar, HalfPacket>(dst + i, p);
i += HalfPacketSize;
}
}
for (; i < count; ++i) {
dst[i] = src[i];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
// Scatter from `src` to `dst`.
eigen_assert(src_stride == 1 && dst_stride != 1);
for (; i < vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
if (HasHalfPacket) {
const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
if (i < vectorized_half_size) {
HalfPacket p = ploadu<HalfPacket>(src + i);
pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
i += HalfPacketSize;
}
}
for (; i < count; ++i) {
dst[i * dst_stride] = src[i];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
// Fill `dst` with value at `*src`.
eigen_assert(src_stride == 0 && dst_stride == 1);
const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
Scalar s = *src;
Packet p = pset1<Packet>(s);
for (; i < unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
}
}
for (; i < vectorized_size; i += PacketSize) {
pstoreu<Scalar, Packet>(dst + i, p);
}
if (HasHalfPacket) {
const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
if (i < vectorized_half_size) {
HalfPacket hp = pset1<HalfPacket>(s);
pstoreu<Scalar, HalfPacket>(dst + i, hp);
i += HalfPacketSize;
}
}
for (; i < count; ++i) {
dst[i] = s;
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
// Scatter `*src` into `dst`.
eigen_assert(src_stride == 0 && dst_stride != 1);
Scalar s = *src;
Packet p = pset1<Packet>(s);
for (; i < vectorized_size; i += PacketSize) {
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
if (HasHalfPacket) {
const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
if (i < vectorized_half_size) {
HalfPacket hp = pset1<HalfPacket>(s);
pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
i += HalfPacketSize;
}
}
for (; i < count; ++i) {
dst[i * dst_stride] = s;
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Gather) {
// Gather from `src` into `dst`.
eigen_assert(dst_stride == 1);
for (; i < vectorized_size; i += PacketSize) {
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
pstoreu<Scalar, Packet>(dst + i, p);
}
if (HasHalfPacket) {
const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
if (i < vectorized_half_size) {
HalfPacket p = pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
pstoreu<Scalar, HalfPacket>(dst + i, p);
i += HalfPacketSize;
}
}
for (; i < count; ++i) {
dst[i] = src[i * src_stride];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Random) {
// Random.
for (; i < count; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
} else {
eigen_assert(false);
}
}
};
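// Example (illustrative usage sketch; `src_buf` and `dst_buf` are hypothetical
// float buffers): scatter 8 contiguous floats from `src_buf` into every 3rd
// element of `dst_buf`.
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;
//   Copier::Run<Copier::Kind::Scatter>(
//       Copier::Dst(/*offset=*/0, /*stride=*/3, dst_buf),
//       Copier::Src(/*offset=*/0, /*stride=*/1, src_buf),
//       /*count=*/8);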
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for `src` we only need to know the strides to navigate through the
// source memory buffer.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
static constexpr bool IsColMajor = (Layout == ColMajor);
typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
public:
typedef DSizes<IndexType, NumDims> Dimensions;
typedef DSizes<int, NumDims> DimensionsMap;
struct Dst {
Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0)
: dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
Dimensions dims;
Dimensions strides;
Scalar* data;
IndexType offset;
};
struct Src {
Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0)
: strides(src_strides), data(src), offset(src_offset) {}
Dimensions strides;
const Scalar* data;
IndexType offset;
};
// Copies data to `dst` from `src`, using provided dimensions mapping:
//
// src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
//
// Returns the number of copied elements.
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src,
const DimensionsMap& dst_to_src_dim_map) {
// Copy single scalar value from `src` to `dst`.
if (NumDims == 0) {
*(dst.data + dst.offset) = *(src.data + src.offset);
return 1;
}
// Both `dst` and `src` must have a contiguous innermost dimension. We also
// accept the special case of stride '0', because it's used as a trick to
// implement broadcasting.
{
int inner_dim = IsColMajor ? 0 : NumDims - 1;
EIGEN_UNUSED_VARIABLE(inner_dim);
eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
}
// Give a shorter name to `dst_to_src_dim_map`.
const DimensionsMap& dim_map = dst_to_src_dim_map;
// Do not squeeze reordered inner dimensions.
int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
// NOTE: We find the innermost dimension (contiguous in memory) in the dst
// block, and we write data linearly into that dimension, reading it from
// the src. If dimensions are reordered, we might end up reading data from
// the src with `stride != 1`.
//
// NOTE: Random-Read/Linear-Write can be up to ~2X faster than
// Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
// Find the innermost dimension in the dst whose size is not 1. This is the
// effective inner dim.
int num_size_one_inner_dims = 0;
for (int i = 0; i < num_squeezable_dims; ++i) {
const int dst_dim = IsColMajor ? i : NumDims - i - 1;
if (dst.dims[dst_dim] != 1) break;
num_size_one_inner_dims++;
}
// If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
if (num_size_one_inner_dims == NumDims) {
*(dst.data + dst.offset) = *(src.data + src.offset);
return 1;
}
// Outermost dimension in the dst with `stride == 1` (contiguous in memory).
const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1;
// Dimension in the src that corresponds to the dst innermost dimension.
const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
// Size of the innermost dimension (length of contiguous blocks of memory).
IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
// Squeeze multiple inner dims into one if they are contiguous in `dst` and
// `src` memory, so we can do fewer linear copy calls.
for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
const int dst_dim = IsColMajor ? i : NumDims - i - 1;
const IndexType dst_stride = dst.strides[dst_dim];
const IndexType src_stride = src.strides[dim_map[dst_dim]];
if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
dst_inner_dim_size *= dst.dims[dst_dim];
++num_size_one_inner_dims;
} else {
break;
}
}
// Setup strides to read data from `src` and write to `dst`.
IndexType input_offset = src.offset;
IndexType output_offset = dst.offset;
IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> it;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int idx = 0; // currently initialized iterator state index
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
if (dst.dims[dst_dim] == 1) continue;
it[idx].size = dst.dims[dst_dim];
it[idx].input_stride = src.strides[dim_map[dst_dim]];
it[idx].output_stride = dst.strides[dst_dim];
it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// Iterate copying data from src to dst.
const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
#define COPY_INNER_DIM(KIND) \
IndexType num_copied = 0; \
for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) { \
LinCopy::template Run<KIND>(typename LinCopy::Dst(output_offset, output_stride, dst.data), \
typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \
\
for (int j = 0; j < idx; ++j) { \
if (++it[j].count < it[j].size) { \
input_offset += it[j].input_stride; \
output_offset += it[j].output_stride; \
break; \
} \
it[j].count = 0; \
input_offset -= it[j].input_span; \
output_offset -= it[j].output_span; \
} \
} \
return num_copied;
if (input_stride == 1 && output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::Linear);
} else if (input_stride == 1 && output_stride != 1) {
COPY_INNER_DIM(LinCopy::Kind::Scatter);
} else if (input_stride == 0 && output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::FillLinear);
} else if (input_stride == 0 && output_stride != 1) {
COPY_INNER_DIM(LinCopy::Kind::FillScatter);
} else if (output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::Gather);
} else {
COPY_INNER_DIM(LinCopy::Kind::Random);
}
#undef COPY_INNER_DIM
}
// Copy from `src` to `dst` with an identity src->dst dimension map. Returns
// the number of copied elements.
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {
DimensionsMap dst_to_src_map;
for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
return Copy(dst, src, dst_to_src_map);
}
private:
struct BlockIteratorState {
BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {}
IndexType size;
IndexType count;
IndexType input_stride;
IndexType output_stride;
IndexType input_span;
IndexType output_span;
};
// Compute how many inner dimensions we are allowed to squeeze when doing IO
// between two tensor blocks. It's safe to squeeze inner dimensions only if
// they are not reordered.
static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
int num_squeezable_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
if (dim_map[dim] != dim) break;
num_squeezable_dims++;
}
return num_squeezable_dims;
}
};
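// Example (illustrative usage sketch; `src_buf` and `dst_buf` are hypothetical
// float buffers): copy a 2x3 block out of a column-major 4x5 tensor stored in
// `src_buf` into a dense 2x3 destination `dst_buf`.
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   IO::Dimensions block_dims(2, 3);
//   IO::Dst dst(block_dims, internal::strides<ColMajor>(block_dims), dst_buf);
//   IO::Src src(/*src_strides=*/IO::Dimensions(1, 4), src_buf);
//   IO::Copy(dst, src);  // returns the number of copied elements (6)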
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if the dimensions are reordered. If you need to do that, you should
// materialize the Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see the definition above).
//
// Also, currently the innermost dimension of `target` must have a stride of
// '1' (contiguous in memory). This restriction could be lifted with a
// `pscatter`, but in practice it's never needed, and there is a similar
// TensorBlockIO workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr, typename IndexType = Eigen::Index>
class TensorBlockAssignment {
// We will use coeff/packet path to evaluate block expressions.
typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> TensorBlockEvaluator;
typedef DSizes<IndexType, NumDims> Dimensions;
enum { Vectorizable = packet_traits<Scalar>::Vectorizable, PacketSize = packet_traits<Scalar>::size };
template <bool Vectorizable, typename Evaluator>
struct InnerDimAssign {
EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
for (IndexType i = 0; i < count; ++i) {
target[i] = eval.coeff(eval_offset + i);
}
}
};
template <typename Evaluator>
struct InnerDimAssign<true, Evaluator> {
EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
typedef typename packet_traits<Scalar>::type Packet;
const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
const IndexType vectorized_size = PacketSize * (count / PacketSize);
IndexType i = 0;
for (; i < unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
const IndexType idx = eval_offset + i + j * PacketSize;
Packet p = eval.template packet<Unaligned>(idx);
pstoreu<Scalar>(target + i + j * PacketSize, p);
}
}
for (; i < vectorized_size; i += PacketSize) {
Packet p = eval.template packet<Unaligned>(eval_offset + i);
pstoreu<Scalar>(target + i, p);
}
for (; i < count; ++i) {
target[i] = eval.coeff(eval_offset + i);
}
}
};
public:
struct Target {
Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
IndexType target_offset = 0)
: dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {}
Dimensions dims;
Dimensions strides;
Scalar* data;
IndexType offset;
};
static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
IndexType target_offset = 0) {
return Target(target_dims, target_strides, target_data, target_offset);
}
template <typename TargetDimsIndexType, typename TargetStridesIndexType>
static Target target(const DSizes<TargetDimsIndexType, NumDims>& target_dims,
const DSizes<TargetStridesIndexType, NumDims>& target_strides, Scalar* target_data,
IndexType target_offset = 0) {
// DSizes constructor will do index type promotion if it's safe.
return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) {
// Prepare evaluator for block expression.
DefaultDevice default_device;
TensorBlockEvaluator eval(expr, default_device);
// Tensor block expression dimensions should match the destination dimensions.
eigen_assert(dimensions_match(target.dims, eval.dimensions()));
static const int Layout = TensorBlockEvaluator::Layout;
static const bool is_col_major = Layout == ColMajor;
// Initialize output inner dimension size based on a layout.
const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
IndexType output_inner_dim_size = target.dims[inner_dim_idx];
// Target inner dimension stride must be '1'.
eigen_assert(target.strides[inner_dim_idx] == 1);
// Squeeze multiple inner dims into one if they are contiguous in `target`.
IndexType num_squeezed_dims = 0;
for (Index i = 1; i < NumDims; ++i) {
const Index dim = is_col_major ? i : NumDims - i - 1;
const IndexType target_stride = target.strides[dim];
if (output_inner_dim_size == target_stride) {
output_inner_dim_size *= target.dims[dim];
num_squeezed_dims++;
} else {
break;
}
}
// Initialize output block iterator state. Dimensions in this array are
// always in innermost -> outermost order (column-major layout).
array<BlockIteratorState, NumDims> it;
int idx = 0; // currently initialized iterator state index
for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
it[idx].count = 0;
it[idx].size = target.dims[dim];
it[idx].output_stride = target.strides[dim];
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// We read the block expression from the beginning, and start writing data to
// `target` at the given offset.
IndexType input_offset = 0;
IndexType output_offset = target.offset;
// Iterate copying data from `eval` to `target`.
for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
// Assign to `target` at current offset.
InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, TensorBlockEvaluator>::Run(
target.data + output_offset, output_inner_dim_size, eval, input_offset);
// Move input offset forward by the number of assigned coefficients.
input_offset += output_inner_dim_size;
// Update index.
for (int j = 0; j < idx; ++j) {
if (++it[j].count < it[j].size) {
output_offset += it[j].output_stride;
break;
}
it[j].count = 0;
output_offset -= it[j].output_span;
}
}
}
private:
struct BlockIteratorState {
BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {}
IndexType count;
IndexType size;
IndexType output_stride;
IndexType output_span;
};
};
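// Example (illustrative usage sketch; `BlockExpr`, `expr` and `out` are a
// hypothetical column-major block expression type, an instance of it, and a
// float* destination buffer): assign the expression to a dense 2x3 target.
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> Assign;
//   DSizes<Eigen::Index, 2> dims(2, 3);
//   Assign::Run(
//       Assign::target(dims, internal::strides<ColMajor>(dims), out), expr);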
// -------------------------------------------------------------------------- //
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H