// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2;

// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer of the given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators; use
// this function everywhere instead.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

#if EIGEN_HAS_CXX11
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
#endif
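
// A small worked example of the helper above (illustrative only, not part of
// the original file). For dimensions [2, 3, 4]:
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // col == [1, 2, 6]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // row == [12, 4, 1]
//
// i.e. in ColMajor the stride of dimension `i` is the product of all dimension
// sizes before `i`, and in RowMajor it is the product of all sizes after `i`.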

// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.

template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we can optimize performance by evaluating the
  // root expression directly into that memory, or by using it as temporary
  // storage for some of the subexpressions, to avoid dynamic memory allocation.
  //
  // The storage is type erased, because passing the Scalar type through all the
  // expression evaluation layers would require far too many templates. It also
  // makes it possible to use this destination as a temporary buffer for
  // materializing expressions with a type that does not match the final output.
  class DestinationBuffer {
   public:
    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
                      const Dimensions& strides, size_t total_dst_bytes)
        : m_data(static_cast<void*>(data)),
          m_dimensions(dimensions),
          m_strides(strides),
          m_total_dst_bytes(total_dst_bytes) {
      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
      for (int i = 0; i < NumDims; ++i) {
        m_dimensions[i] *= sizeof(Scalar);
        m_strides[i] *= sizeof(Scalar);
      }
    }

    // Returns true if the tensor block corresponding to `desc` fits into the
    // contiguous block of memory defined by `*this`.
    template <typename Scalar, int Layout>
    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
      if (m_data == NULL) return false;

      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& dst_dims = dimensions<Scalar>();

      if (!dimensions_match(desc_dims, dst_dims)) return false;

      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      const Dimensions& dst_strides = internal::strides<Layout>(dst_dims);

      return dimensions_match(desc_strides, dst_strides);
    }

    template <typename Scalar>
    Dimensions dimensions() const {
      Dimensions dimensions;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
      }
      return dimensions;
    }

    template <typename Scalar>
    Dimensions strides() const {
      Dimensions strides;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
        strides[i] = m_strides[i] / sizeof(Scalar);
      }
      return strides;
    }

    void* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;

    // Total size of the memory buffer at the destination (typically the total
    // size of the left hand side of an assignment expression). This can be the
    // same as `array_prod(m_dimensions)` if the assignment target has just a
    // single block, but typically it's a larger number.
    size_t m_total_dst_bytes;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  template <typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
                            size_t total_dst_bytes) {
    m_destination =
        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
  }

  template <typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
      size_t total_dst_bytes) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    return *this;
  }

  // Returns a non-null pointer to the destination buffer memory if this
  // block has a contiguous destination buffer.
  template <typename Scalar, int Layout>
  Scalar* destination() const {
    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
      return m_destination.template data<Scalar>();
    }
    return NULL;
  }

 private:
  // Offset and dimensions are immutable after construction. The block
  // descriptor can only be mutated by adding or dropping the destination
  // buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
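
// A minimal usage sketch for TensorBlockDescriptor (illustrative only; the
// variables `output` and `total_output_bytes` are hypothetical, not part of
// this file). A block evaluator typically receives a descriptor, optionally
// attaches the final output buffer as a destination, and materializes directly
// into it when the block layout matches:
//
//   typedef TensorBlockDescriptor<2> Desc;
//   // Block of 4x8 coefficients starting at linear offset 16.
//   Desc desc(/*offset=*/16, Desc::Dimensions(4, 8));
//
//   desc.AddDestinationBuffer(output + desc.offset(),
//                             internal::strides<ColMajor>(desc.dimensions()),
//                             total_output_bytes);
//
//   float* dst = desc.destination<float, ColMajor>();
//   if (dst == NULL) {
//     // The destination is missing or not contiguous for this block; fall
//     // back to a scratch buffer and call desc.DropDestinationBuffer().
//   }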

// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Because Eigen
// expression traversal order is deterministic, all temporary allocations
// happen in the same order and usually have exactly the same sizes. The
// scratch allocator keeps track of all dynamic allocations, so after the first
// block evaluation is completed we can reuse all the temporary buffers for the
// next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger than or equal
    // to the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate it,
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};
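
// A minimal usage sketch (illustrative only; `num_blocks` and the buffer sizes
// are hypothetical). The same allocator instance is reused across block
// evaluations: `reset()` rewinds the allocation index so that the buffers
// allocated for the first block are handed out again for the next one, in the
// same order, without going back to the device allocator.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   for (Index block = 0; block < num_blocks; ++block) {
//     void* buf0 = scratch.allocate(1024);  // reused for every block
//     void* buf1 = scratch.allocate(512);   // reused for every block
//     // ... evaluate the block using buf0/buf1 ...
//     scratch.reset();
//   }
//   // All buffers are deallocated when `scratch` goes out of scope.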

// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
#if !EIGEN_HAS_CXX11
// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace.
// (Use of an enumeration in a nested name specifier is a C++11 extension.)
namespace TensorBlockKind {
#endif
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory. The block
  // expression is still a valid Tensor expression, and can be used to build
  // lazy expressions.
  kMaterializedInOutput

  // TODO(ezhulenev): If we know that we are evaluating a block for the root of
  // the expression tree, it might be beneficial to do an assignment to the
  // output memory buffer, even if it will be impossible to construct a valid
  // block expression after that (e.g. the output memory buffer has strides not
  // compatible with TensorMap). This might be a performance optimization for
  // uniformly shaped blocks, because for blocks skewed towards the inner
  // dimension `kMaterializedInOutput` should always work.
};
#if !EIGEN_HAS_CXX11
}  // namespace TensorBlockKind
#endif

// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.

class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from an Eigen expression (if the
// expression type is not void). It's required to be able to define a lazy
// block expression for argument types that do not support block evaluation.

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer. It's either a memory
// buffer that backs the original expression (e.g. the block is just a view
// into a Tensor), or a memory buffer allocated with the scratch allocator, in
// which case the scratch allocator will deallocate it at the end of the
// block-based expression execution.

template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const { return m_expr; }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the tensor block dimensions cover a contiguous block of the underlying
    // memory, we can skip the block buffer memory allocation, and construct a
    // block from the existing `data` memory buffer.
    //
    // Example: (RowMajor layout)
    //   data_dims:         [11, 12, 13, 14]
    //   desc.dimensions(): [ 1,  1,  3, 14]
    //
    // In this case we can construct a TensorBlock starting at
    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // before the matching inner dimension (`3` in the example above).
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      Scalar* block_buffer = static_cast<Scalar*>(mem);

      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(desc.dimensions(),
                           internal::strides<Layout>(desc.dimensions()),
                           block_buffer);

      TensorBlockIO::Copy(dst, src);

      return TensorMaterializedBlock(
          internal::TensorBlockKind::kMaterializedInScratch, block_buffer,
          desc.dimensions());
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
};
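
// A minimal sketch of how `materialize` is typically used by a block evaluator
// (illustrative only; `input`, `input_dims` and `scratch` are hypothetical
// names, not part of this file). Given a raw buffer and its dimensions, the
// helper either returns a `kView` block pointing directly into the buffer, or
// copies the requested region into scratch memory and returns a
// `kMaterializedInScratch` block:
//
//   typedef TensorMaterializedBlock<float, 3, RowMajor> Block;
//   typedef Block::TensorBlockDesc Desc;
//
//   Desc desc(/*offset=*/0, Desc::Dimensions(1, 2, 16));
//   Block block = Block::materialize(input, input_dims, desc, scratch);
//
//   const float* block_data = block.data();     // always non-null here
//   const Block::XprType& expr = block.expr();  // TensorMap over block_data
//   block.cleanup();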

// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies the
// UnaryOp functor to the blocks produced by the underlying Tensor expression.

template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies the
// BinaryOp functor to the blocks produced by the underlying Tensor expressions.

template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
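
// A minimal sketch of how these lazy blocks compose (illustrative only; the
// argument blocks `lhs_block` and `rhs_block` are hypothetical and would come
// from the argument evaluators). The binary block does not evaluate anything
// itself; `expr()` just stitches the argument block expressions together into
// a TensorCwiseBinaryOp that is evaluated later, e.g. by TensorBlockAssignment:
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> ArgBlock;
//   typedef scalar_sum_op<float> SumOp;
//   typedef TensorCwiseBinaryBlock<SumOp, ArgBlock, ArgBlock> SumBlock;
//
//   SumBlock block(lhs_block, rhs_block, SumOp());
//   SumBlock::XprType expr = block.expr();  // lhs.expr() + rhs.expr(), lazily
//   block.cleanup();                        // cleans up both argument blocks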

// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is
// a generalization of the TensorCwiseUnaryBlock for arbitrary expressions).

template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};

// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.

template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
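
// A minimal sketch of the BlockFactory interface expected by the two classes
// above (illustrative only; `HypotheticalSelectBlockFactory` is a made-up name,
// not part of this file). A factory provides (a) a metafunction mapping the
// argument expression types to the resulting expression type, and (b) an
// `expr()` method that builds that expression from the argument block
// expressions:
//
//   struct HypotheticalSelectBlockFactory {
//     template <typename IfXpr, typename ThenXpr, typename ElseXpr>
//     struct XprType {
//       typedef TensorSelectOp<const IfXpr, const ThenXpr, const ElseXpr> type;
//     };
//
//     template <typename IfXpr, typename ThenXpr, typename ElseXpr>
//     typename XprType<IfXpr, ThenXpr, ElseXpr>::type expr(
//         const IfXpr& if_expr, const ThenXpr& then_expr,
//         const ElseXpr& else_expr) const {
//       return typename XprType<IfXpr, ThenXpr, ElseXpr>::type(
//           if_expr, then_expr, else_expr);
//     }
//   };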

// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.

template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
  enum Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == Linear) {
      // ******************************************************************** //
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == FillLinear) {
      // Fill `dst` with value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }
      // ******************************************************************** //
    } else if (kind == FillScatter) {
      // Scatter `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }
      // ******************************************************************** //
    } else if (kind == Gather) {
      // Gather from `src` into `dst`.
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }
      // ******************************************************************** //
    } else if (kind == Random) {
      // Random.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};
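
// A minimal usage sketch (illustrative only). Copying 8 floats from a
// contiguous source into every third element of a destination buffer uses the
// `Scatter` specialization; offsets and strides are given in elements:
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;
//
//   float src_buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   float dst_buf[24] = {0};
//
//   Copier::Run<Copier::Scatter>(
//       Copier::Dst(/*offset=*/0, /*stride=*/3, dst_buf),
//       Copier::Src(/*offset=*/0, /*stride=*/1, src_buf),
//       /*count=*/8);
//   // dst_buf now holds {0, 0, 0, 1, 0, 0, 2, 0, 0, 3, ...}.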

// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for the `src` we only need to know the strides to navigate through
// the source memory buffer.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2 {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dimensions mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case with stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
    // block, and we write data linearly into that dimension, reading it from
    // the src. If dimensions are reordered, we might end up reading data from
    // the src with `stride != 1`.
    //
    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dim.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const int dst_stride1_dim =
        IsColMajor ? num_size_one_inner_dims
                   : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                              \
  for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) { \
    LinCopy::template Run<KIND>(                                         \
        typename LinCopy::Dst(output_offset, output_stride, dst.data),   \
        typename LinCopy::Src(input_offset, input_stride, src.data),     \
        dst_inner_dim_size);                                             \
                                                                         \
    for (int j = 0; j < idx; ++j) {                                      \
      if (++it[j].count < it[j].size) {                                  \
        input_offset += it[j].input_stride;                              \
        output_offset += it[j].output_stride;                            \
        break;                                                           \
      }                                                                  \
      it[j].count = 0;                                                   \
      input_offset -= it[j].input_span;                                  \
      output_offset -= it[j].output_span;                                \
    }                                                                    \
  }

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(const Dst& dst,
                                                         const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only if
  // they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
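
// A minimal usage sketch (illustrative only; `src_data`, `src_offset` and
// `dst_data` are hypothetical). Copying a 2x3 ColMajor block that starts at
// linear offset `src_offset` inside a larger 10x10 source tensor into a dense
// 2x3 destination buffer:
//
//   typedef TensorBlockIOV2<float, Eigen::Index, 2, ColMajor> BlockIO;
//
//   BlockIO::Dimensions block_dims(2, 3);
//   BlockIO::Dimensions src_strides = internal::strides<ColMajor>(
//       BlockIO::Dimensions(10, 10));                   // [1, 10]
//   BlockIO::Dimensions dst_strides =
//       internal::strides<ColMajor>(block_dims);        // [1, 2]
//
//   BlockIO::Src src(src_strides, src_data, src_offset);
//   BlockIO::Dst dst(block_dims, dst_strides, dst_data);
//   BlockIO::Copy(dst, src);  // identity dimension map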

// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see the definition above).
//
// Also, currently the innermost dimension of `target` must have a stride of
// '1' (contiguous in memory). This restriction could be lifted with a
// `pscatter`, but in practice it's never needed, and there is a similar
// TensorBlockIO workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare the evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match the destination
    // dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize the output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read block expression from the beginning, and start writing data to
    // `target` at given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
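
// A minimal usage sketch (illustrative only; `BlockExpr`, `block_expr`,
// `buffer` and the shapes are hypothetical). Given a lazy block expression
// (e.g. produced by `TensorCwiseBinaryBlock::expr()`), TensorBlockAssignment
// evaluates it through the coeff/packet path and writes the result into a
// destination buffer described by a Target:
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> Assignment;
//
//   Assignment::Run(
//       Assignment::target(DSizes<Eigen::Index, 2>(4, 8),  // dims
//                          DSizes<Eigen::Index, 2>(1, 4),  // ColMajor strides
//                          buffer, /*target_offset=*/0),
//       block_expr);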

// -------------------------------------------------------------------------- //

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H