// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2;

// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer of given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

#if EIGEN_HAS_CXX11
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
#endif

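// Usage sketch (illustrative, not part of the public API): for a tensor of
// dimensions [2, 3, 4] the densely packed strides are [12, 4, 1] in RowMajor
// layout and [1, 2, 6] in ColMajor layout. A hypothetical caller might do:
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> row_major_strides = strides<RowMajor>(dims);
//   DSizes<Eigen::Index, 3> col_major_strides = strides<ColMajor>(dims);
//
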
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.

template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we might do a performance optimization and
  // evaluate the root expression directly into that memory, or maybe use it as
  // temporary storage for some of the subexpressions, to avoid dynamic memory
  // allocation.
  //
  // This is type-erased storage, because passing the Scalar type through all
  // the expression evaluation layers requires way too many templates. It should
  // also be possible to use this destination as a temp buffer for materializing
  // expressions whose type does not match the final output.
  class DestinationBuffer {
   public:
    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
                      const Dimensions& strides, size_t total_dst_bytes)
        : m_data(static_cast<void*>(data)),
          m_dimensions(dimensions),
          m_strides(strides),
          m_total_dst_bytes(total_dst_bytes) {
      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
      for (int i = 0; i < NumDims; ++i) {
        m_dimensions[i] *= sizeof(Scalar);
        m_strides[i] *= sizeof(Scalar);
      }
    }

    // Returns true if the tensor block corresponding to `desc` fits into the
    // contiguous block of memory defined by `*this`.
    template <typename Scalar, int Layout>
    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
      if (m_data == NULL) return false;

      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& dst_dims = dimensions<Scalar>();

      if (!dimensions_match(desc_dims, dst_dims)) return false;

      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      const Dimensions& dst_strides = internal::strides<Layout>(dst_dims);

      return dimensions_match(desc_strides, dst_strides);
    }

    template <typename Scalar>
    Dimensions dimensions() const {
      Dimensions dimensions;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
      }
      return dimensions;
    }

    template <typename Scalar>
    Dimensions strides() const {
      Dimensions strides;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
        strides[i] = m_strides[i] / sizeof(Scalar);
      }
      return strides;
    }

    void* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;

    // Total size of the memory buffer at the destination (typically the total
    // size of the left hand side of an assignment expression). This can be the
    // same as `array_prod(m_dimensions)` if the assignment target has just a
    // single block, but typically it's a larger number.
    size_t m_total_dst_bytes;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  template <typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
                            size_t total_dst_bytes) {
    m_destination =
        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
  }

  template <typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
      size_t total_dst_bytes) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    return *this;
  }

  // Returns a non-null pointer to the destination buffer memory if this
  // block has a contiguous destination buffer.
  template <typename Scalar, int Layout>
  Scalar* destination() const {
    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
      return m_destination.template data<Scalar>();
    }
    return NULL;
  }

 private:
  // Offset and dimensions are immutable after construction. Block descriptor
  // can only be mutated by adding or dropping the destination buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};

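// Usage sketch (illustrative, with hypothetical values): describe a [2, 4]
// block that starts at linear offset 8 of the underlying tensor:
//
//   typedef TensorBlockDescriptor<2, Eigen::Index> BlockDesc;
//   BlockDesc desc(/*offset=*/8, BlockDesc::Dimensions(2, 4));
//   eigen_assert(desc.size() == 8);
//
// An assignment evaluator may later attach a destination buffer with
// `AddDestinationBuffer(...)`, and expression evaluators can query it through
// `destination<Scalar, Layout>()` to materialize blocks directly into the
// output memory.
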
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// happen in the same order, and usually have exactly the same size. The
// scratch allocator keeps track of all dynamic allocations, so after the first
// block evaluation is completed, we should be able to reuse all the temporary
// buffers for the next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger than or equal
    // to the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate
    // it, and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};

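// Usage sketch (illustrative): an evaluator allocates scratch buffers while
// evaluating one block and calls `reset()` before the next block, so the same
// allocations are reused instead of hitting the device allocator again:
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   // for each block:
//   //   void* buf = scratch.allocate(block_size_in_bytes);  // hypothetical
//   //   ... materialize the block into `buf` ...
//   //   scratch.reset();
//
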
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
#if !EIGEN_HAS_CXX11
// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace.
// (Use of enumeration in a nested name specifier is a C++11 extension).
namespace TensorBlockKind {
#endif
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory. The block
  // expression is still a valid Tensor expression, and can be used to build
  // lazy expressions.
  kMaterializedInOutput

  // TODO(ezhulenev): If we know that we are evaluating a block for the root of
  // the expression tree, it might be beneficial to do an assignment to the
  // output memory buffer, even if it will be impossible to construct a valid
  // block expression after that (e.g. output memory buffer has strides not
  // compatible with TensorMap). This might be a performance optimization for
  // uniformly shaped blocks, because for blocks skewed towards inner dimension
  // `kMaterializedInOutput` should always work.
};
#if !EIGEN_HAS_CXX11
}  // namespace TensorBlockKind
#endif

// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.

class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from Eigen expressions (if the expression
// type is not void). It's required to be able to define a lazy block
// expression for argument types that do not support block evaluation.

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer: it's either a memory
// buffer that backs the original expression (e.g. the block is just a view
// into a Tensor), or a memory buffer allocated with the scratch allocator, in
// which case the scratch allocator will deallocate it at the end of block
// based expression execution.

template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const { return m_expr; }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the tensor block dimensions cover a contiguous block of the underlying
    // memory, we can skip the block buffer memory allocation, and construct a
    // block from the existing `data` memory buffer.
    //
    // Example: (RowMajor layout)
    // data_dims: [11, 12, 13, 14]
    // desc.dimensions(): [1, 1, 3, 14]
    //
    // In this case we can construct a TensorBlock starting at
    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // right before the matching inner dimensions (`3` in the example above).
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      Scalar* block_buffer = static_cast<Scalar*>(mem);

      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(desc.dimensions(),
                           internal::strides<Layout>(desc.dimensions()),
                           block_buffer);

      TensorBlockIO::Copy(dst, src);

      return TensorMaterializedBlock(
          internal::TensorBlockKind::kMaterializedInScratch, block_buffer,
          desc.dimensions());
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
};

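// Usage sketch (illustrative, with hypothetical names): an evaluator that keeps
// its result in a plain buffer can produce a block for a descriptor `desc`
// either as a view or as a scratch-backed copy, depending on whether the
// requested block is contiguous in that buffer:
//
//   typedef TensorMaterializedBlock<float, 3, RowMajor> TensorBlock;
//   TensorBlock block =
//       TensorBlock::materialize(buffer, buffer_dims, desc, scratch);
//   // block.kind() is kView when `desc` maps to contiguous memory in
//   // `buffer`, and kMaterializedInScratch otherwise.
//
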
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
// functor to the blocks produced by the underlying Tensor expression.

template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};

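// Usage sketch (illustrative, with hypothetical names): a block-capable unary
// evaluator wraps the block produced by its argument into a lazy expression
// block, e.g. a sqrt evaluator could return
//
//   TensorCwiseUnaryBlock<scalar_sqrt_op<float>, ArgTensorBlock>(
//       arg_block, scalar_sqrt_op<float>());
//
// The consumer then assigns `block.expr()` to its destination (for example
// with TensorBlockAssignment) and finally calls `block.cleanup()`.
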
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies
// BinaryOp functor to the blocks produced by the underlying Tensor expressions.

template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is a
// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).

template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};

// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.

template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};

// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.

template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
  enum Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == Linear) {
      // ******************************************************************** //
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == FillLinear) {
      // Fill `dst` with the value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }
      // ******************************************************************** //
    } else if (kind == FillScatter) {
      // Scatter `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }
      // ******************************************************************** //
    } else if (kind == Gather) {
      // Gather from `src` into `dst`.
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }
      // ******************************************************************** //
    } else if (kind == Random) {
      // Random.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};

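// Usage sketch (illustrative, with hypothetical buffers): copy 100 floats from
// a contiguous source into a destination column with stride 16, using the
// statically selected Scatter kernel:
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//   LinCopy::Run<LinCopy::Scatter>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/16, dst_ptr),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_ptr),
//       /*count=*/100);
//
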
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for the `src` we only need to know the strides to navigate through
// the source memory buffer.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2 {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dimensions mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case with stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
    // block, and we write data linearly into that dimension, reading it from
    // the src. If dimensions are reordered, we might end up reading data from
    // the src with `stride != 1`.
    //
    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dim.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const int dst_stride1_dim =
        IsColMajor ? num_size_one_inner_dims
                   : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                              \
  for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) { \
    LinCopy::template Run<KIND>(                                          \
        typename LinCopy::Dst(output_offset, output_stride, dst.data),   \
        typename LinCopy::Src(input_offset, input_stride, src.data),     \
        dst_inner_dim_size);                                              \
                                                                          \
    for (int j = 0; j < idx; ++j) {                                       \
      if (++it[j].count < it[j].size) {                                   \
        input_offset += it[j].input_stride;                               \
        output_offset += it[j].output_stride;                             \
        break;                                                            \
      }                                                                   \
      it[j].count = 0;                                                    \
      input_offset -= it[j].input_span;                                   \
      output_offset -= it[j].output_span;                                 \
    }                                                                     \
  }

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(const Dst& dst,
                                                         const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only
  // if they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};

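// Usage sketch (illustrative, with hypothetical buffers): copy a [2, 3] block
// out of a column-major [10, 10] tensor into a dense [2, 3] destination
// buffer, starting at linear offset `src_offset` in the source:
//
//   typedef TensorBlockIOV2<float, Eigen::Index, 2, ColMajor> TensorBlockIO;
//   TensorBlockIO::Dimensions block_dims(2, 3);
//   TensorBlockIO::Dimensions src_strides(1, 10);
//   TensorBlockIO::Dst dst(block_dims,
//                          internal::strides<ColMajor>(block_dims), dst_buffer);
//   TensorBlockIO::Src src(src_strides, src_buffer, src_offset);
//   TensorBlockIO::Copy(dst, src);  // identity dimension map
//
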
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory, if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see definition above).
//
// Also currently the innermost dimension of `target` must have a stride '1'
// (contiguous in memory). This restriction could be lifted with a `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare evaluator for block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match destination dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize output inner dimension size based on a layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read the block expression from the beginning, and start writing data
    // to `target` at the given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at the current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};

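// Usage sketch (illustrative; all names other than the TensorBlockAssignment
// API are hypothetical): assign the expression held by a lazy block to the
// output buffer described by a block descriptor `desc`, where `output_dims`
// and `output_data` describe the full destination tensor:
//
//   typedef TensorBlockAssignment<float, NumDims, BlockExpr, Eigen::Index>
//       BlockAssign;
//   BlockAssign::Run(
//       BlockAssign::target(desc.dimensions(),
//                           internal::strides<Layout>(output_dims),
//                           output_data, desc.offset()),
//       block.expr());
//
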
// -------------------------------------------------------------------------- //

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H