// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2;

// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer of the given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators; use
// this function everywhere instead.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

#if EIGEN_HAS_CXX11
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
#endif
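
// A small worked example of the helper above (illustrative only, not part of
// the original file). For dimensions [2, 3, 4]:
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // col == [1, 2, 6]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // row == [12, 4, 1]
//
// i.e. in ColMajor the stride of dimension `i` is the product of all dimension
// sizes before `i`, and in RowMajor it is the product of all sizes after `i`.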

// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.

template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we can optimize performance by evaluating the
  // root expression directly into that memory, or by using it as temporary
  // storage for some of the subexpressions, to avoid dynamic memory allocation.
  //
  // The storage is type erased, because passing the Scalar type through all the
  // expression evaluation layers would require far too many templates. It also
  // makes it possible to use this destination as a temporary buffer for
  // materializing expressions with a type that does not match the final output.
  class DestinationBuffer {
   public:
    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
                      const Dimensions& strides, size_t total_dst_bytes)
        : m_data(static_cast<void*>(data)),
          m_dimensions(dimensions),
          m_strides(strides),
          m_total_dst_bytes(total_dst_bytes) {
      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
      for (int i = 0; i < NumDims; ++i) {
        m_dimensions[i] *= sizeof(Scalar);
        m_strides[i] *= sizeof(Scalar);
      }
    }

    // Returns true if the tensor block corresponding to `desc` fits into the
    // contiguous block of memory defined by `*this`.
    template <typename Scalar, int Layout>
    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
      if (m_data == NULL) return false;

      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& dst_dims = dimensions<Scalar>();

      if (!dimensions_match(desc_dims, dst_dims)) return false;

      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      const Dimensions& dst_strides = internal::strides<Layout>(dst_dims);

      return dimensions_match(desc_strides, dst_strides);
    }

    template <typename Scalar>
    Dimensions dimensions() const {
      Dimensions dimensions;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
      }
      return dimensions;
    }

    template <typename Scalar>
    Dimensions strides() const {
      Dimensions strides;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
        strides[i] = m_strides[i] / sizeof(Scalar);
      }
      return strides;
    }

    void* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;

    // Total size of the memory buffer at the destination (typically the total
    // size of the left hand side of an assignment expression). This can be the
    // same as `array_prod(m_dimensions)` if the assignment target has just a
    // single block, but typically it's a larger number.
    size_t m_total_dst_bytes;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  template <typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
                            size_t total_dst_bytes) {
    m_destination =
        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
  }

  template <typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
      size_t total_dst_bytes) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    return *this;
  }

  // Returns a non-null pointer to the destination buffer memory if this
  // block has a contiguous destination buffer.
  template <typename Scalar, int Layout>
  Scalar* destination() const {
    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
      return m_destination.template data<Scalar>();
    }
    return NULL;
  }

 private:
  // Offset and dimensions are immutable after construction. The block
  // descriptor can only be mutated by adding or dropping the destination
  // buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
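
// A minimal usage sketch for TensorBlockDescriptor (illustrative only; the
// variables `output` and `total_output_bytes` are hypothetical, not part of
// this file). A block evaluator typically receives a descriptor, optionally
// attaches the final output buffer as a destination, and materializes directly
// into it when the block layout matches:
//
//   typedef TensorBlockDescriptor<2> Desc;
//   // Block of 4x8 coefficients starting at linear offset 16.
//   Desc desc(/*offset=*/16, Desc::Dimensions(4, 8));
//
//   desc.AddDestinationBuffer(output + desc.offset(),
//                             internal::strides<ColMajor>(desc.dimensions()),
//                             total_output_bytes);
//
//   float* dst = desc.destination<float, ColMajor>();
//   if (dst == NULL) {
//     // The destination is missing or not contiguous for this block; fall
//     // back to a scratch buffer and call desc.DropDestinationBuffer().
//   }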

// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Because Eigen
// expression traversal order is deterministic, all temporary allocations
// happen in the same order and usually have exactly the same sizes. The
// scratch allocator keeps track of all dynamic allocations, so after the first
// block evaluation is completed we can reuse all the temporary buffers for the
// next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger than or equal
    // to the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate it,
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};
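
// A minimal usage sketch (illustrative only; `num_blocks` and the buffer sizes
// are hypothetical). The same allocator instance is reused across block
// evaluations: `reset()` rewinds the allocation index so that the buffers
// allocated for the first block are handed out again for the next one, in the
// same order, without going back to the device allocator.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   for (Index block = 0; block < num_blocks; ++block) {
//     void* buf0 = scratch.allocate(1024);  // reused for every block
//     void* buf1 = scratch.allocate(512);   // reused for every block
//     // ... evaluate the block using buf0/buf1 ...
//     scratch.reset();
//   }
//   // All buffers are deallocated when `scratch` goes out of scope.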

// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
#if !EIGEN_HAS_CXX11
// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace.
// (Use of an enumeration in a nested name specifier is a C++11 extension.)
namespace TensorBlockKind {
#endif
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory. The block
  // expression is still a valid Tensor expression, and can be used to build
  // lazy expressions.
  kMaterializedInOutput

  // TODO(ezhulenev): If we know that we are evaluating a block for the root of
  // the expression tree, it might be beneficial to do an assignment to the
  // output memory buffer, even if it will be impossible to construct a valid
  // block expression after that (e.g. the output memory buffer has strides not
  // compatible with TensorMap). This might be a performance optimization for
  // uniformly shaped blocks, because for blocks skewed towards the inner
  // dimension `kMaterializedInOutput` should always work.
};
#if !EIGEN_HAS_CXX11
}  // namespace TensorBlockKind
#endif

// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.

class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from an Eigen expression (if the
// expression type is not void). It's required to be able to define a lazy
// block expression for argument types that do not support block evaluation.

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer. It's either a memory
// buffer that backs the original expression (e.g. the block is just a view
// into a Tensor), or a memory buffer allocated with the scratch allocator, in
// which case the scratch allocator will deallocate it at the end of the
// block-based expression execution.

template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const { return m_expr; }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the tensor block dimensions cover a contiguous block of the underlying
    // memory, we can skip the block buffer memory allocation, and construct a
    // block from the existing `data` memory buffer.
    //
    // Example: (RowMajor layout)
    //   data_dims:         [11, 12, 13, 14]
    //   desc.dimensions(): [ 1,  1,  3, 14]
    //
    // In this case we can construct a TensorBlock starting at
    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // before the matching inner dimension (`3` in the example above).
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      Scalar* block_buffer = static_cast<Scalar*>(mem);

      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(desc.dimensions(),
                           internal::strides<Layout>(desc.dimensions()),
                           block_buffer);

      TensorBlockIO::Copy(dst, src);

      return TensorMaterializedBlock(
          internal::TensorBlockKind::kMaterializedInScratch, block_buffer,
          desc.dimensions());
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
};
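
// A minimal sketch of how `materialize` is typically used by a block evaluator
// (illustrative only; `input`, `input_dims` and `scratch` are hypothetical
// names, not part of this file). Given a raw buffer and its dimensions, the
// helper either returns a `kView` block pointing directly into the buffer, or
// copies the requested region into scratch memory and returns a
// `kMaterializedInScratch` block:
//
//   typedef TensorMaterializedBlock<float, 3, RowMajor> Block;
//   typedef Block::TensorBlockDesc Desc;
//
//   Desc desc(/*offset=*/0, Desc::Dimensions(1, 2, 16));
//   Block block = Block::materialize(input, input_dims, desc, scratch);
//
//   const float* block_data = block.data();     // always non-null here
//   const Block::XprType& expr = block.expr();  // TensorMap over block_data
//   block.cleanup();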

// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies the
// UnaryOp functor to the blocks produced by the underlying Tensor expression.

template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies the
// BinaryOp functor to the blocks produced by the underlying Tensor expressions.

template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
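
// A minimal sketch of how these lazy blocks compose (illustrative only; the
// argument blocks `lhs_block` and `rhs_block` are hypothetical and would come
// from the argument evaluators). The binary block does not evaluate anything
// itself; `expr()` just stitches the argument block expressions together into
// a TensorCwiseBinaryOp that is evaluated later, e.g. by TensorBlockAssignment:
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> ArgBlock;
//   typedef scalar_sum_op<float> SumOp;
//   typedef TensorCwiseBinaryBlock<SumOp, ArgBlock, ArgBlock> SumBlock;
//
//   SumBlock block(lhs_block, rhs_block, SumOp());
//   SumBlock::XprType expr = block.expr();  // lhs.expr() + rhs.expr(), lazily
//   block.cleanup();                        // cleans up both argument blocks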

// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is
// a generalization of the TensorCwiseUnaryBlock for arbitrary expressions).

template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};

// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.

template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
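
// A minimal sketch of the BlockFactory interface expected by the two classes
// above (illustrative only; `HypotheticalSelectBlockFactory` is a made-up name,
// not part of this file). A factory provides (a) a metafunction mapping the
// argument expression types to the resulting expression type, and (b) an
// `expr()` method that builds that expression from the argument block
// expressions:
//
//   struct HypotheticalSelectBlockFactory {
//     template <typename IfXpr, typename ThenXpr, typename ElseXpr>
//     struct XprType {
//       typedef TensorSelectOp<const IfXpr, const ThenXpr, const ElseXpr> type;
//     };
//
//     template <typename IfXpr, typename ThenXpr, typename ElseXpr>
//     typename XprType<IfXpr, ThenXpr, ElseXpr>::type expr(
//         const IfXpr& if_expr, const ThenXpr& then_expr,
//         const ElseXpr& else_expr) const {
//       return typename XprType<IfXpr, ThenXpr, ElseXpr>::type(
//           if_expr, then_expr, else_expr);
//     }
//   };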

// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.

template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
  enum Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == Linear) {
      // ******************************************************************** //
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == FillLinear) {
      // Fill `dst` with value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }
      // ******************************************************************** //
    } else if (kind == FillScatter) {
      // Scatter `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }
      // ******************************************************************** //
    } else if (kind == Gather) {
      // Gather from `src` into `dst`.
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }
      // ******************************************************************** //
    } else if (kind == Random) {
      // Random.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};
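
// A minimal usage sketch (illustrative only). Copying 8 floats from a
// contiguous source into every third element of a destination buffer uses the
// `Scatter` specialization; offsets and strides are given in elements:
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;
//
//   float src_buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   float dst_buf[24] = {0};
//
//   Copier::Run<Copier::Scatter>(
//       Copier::Dst(/*offset=*/0, /*stride=*/3, dst_buf),
//       Copier::Src(/*offset=*/0, /*stride=*/1, src_buf),
//       /*count=*/8);
//   // dst_buf now holds {0, 0, 0, 1, 0, 0, 2, 0, 0, 3, ...}.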

// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for the `src` we only need to know the strides to navigate through
// the source memory buffer.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2 {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dimensions mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case with stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
    // block, and we write data linearly into that dimension, reading it from
    // the src. If dimensions are reordered, we might end up reading data from
    // the src with `stride != 1`.
    //
    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dim.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const int dst_stride1_dim =
        IsColMajor ? num_size_one_inner_dims
                   : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                              \
  for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) { \
    LinCopy::template Run<KIND>(                                         \
        typename LinCopy::Dst(output_offset, output_stride, dst.data),   \
        typename LinCopy::Src(input_offset, input_stride, src.data),     \
        dst_inner_dim_size);                                             \
                                                                         \
    for (int j = 0; j < idx; ++j) {                                      \
      if (++it[j].count < it[j].size) {                                  \
        input_offset += it[j].input_stride;                              \
        output_offset += it[j].output_stride;                            \
        break;                                                           \
      }                                                                  \
      it[j].count = 0;                                                   \
      input_offset -= it[j].input_span;                                  \
      output_offset -= it[j].output_span;                                \
    }                                                                    \
  }

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(const Dst& dst,
                                                         const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only if
  // they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
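
// A minimal usage sketch (illustrative only; `src_data`, `src_offset` and
// `dst_data` are hypothetical). Copying a 2x3 ColMajor block that starts at
// linear offset `src_offset` inside a larger 10x10 source tensor into a dense
// 2x3 destination buffer:
//
//   typedef TensorBlockIOV2<float, Eigen::Index, 2, ColMajor> BlockIO;
//
//   BlockIO::Dimensions block_dims(2, 3);
//   BlockIO::Dimensions src_strides = internal::strides<ColMajor>(
//       BlockIO::Dimensions(10, 10));                   // [1, 10]
//   BlockIO::Dimensions dst_strides =
//       internal::strides<ColMajor>(block_dims);        // [1, 2]
//
//   BlockIO::Src src(src_strides, src_data, src_offset);
//   BlockIO::Dst dst(block_dims, dst_strides, dst_data);
//   BlockIO::Copy(dst, src);  // identity dimension map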

// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see the definition above).
//
// Also, currently the innermost dimension of `target` must have a stride of
// '1' (contiguous in memory). This restriction could be lifted with a
// `pscatter`, but in practice it's never needed, and there is a similar
// TensorBlockIO workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare the evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match the destination
    // dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize the output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read block expression from the beginning, and start writing data to
    // `target` at given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
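
// A minimal usage sketch (illustrative only; `BlockExpr`, `block_expr`,
// `buffer` and the shapes are hypothetical). Given a lazy block expression
// (e.g. produced by `TensorCwiseBinaryBlock::expr()`), TensorBlockAssignment
// evaluates it through the coeff/packet path and writes the result into a
// destination buffer described by a Target:
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> Assignment;
//
//   Assignment::Run(
//       Assignment::target(DSizes<Eigen::Index, 2>(4, 8),  // dims
//                          DSizes<Eigen::Index, 2>(1, 4),  // ColMajor strides
//                          buffer, /*target_offset=*/0),
//       block_expr);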

// -------------------------------------------------------------------------- //

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H