| #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H |
| #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H |
| |
| namespace Eigen { |
| |
| /** \class TensorBlock |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor block class. |
| * |
| * This class represents a tensor block specified by the index of the |
| * first block coefficient, and the size of the block in each dimension. |
| * |
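* For illustration only, a hypothetical rank-2, column-major block of a
* 10x10 float tensor covering rows [2,5) and columns [3,7) could be
* described as follows ('block_buffer' is an assumed, caller-owned array):
*
* \code
* typedef Eigen::internal::TensorBlock<Eigen::Index, float, 2,
*                                      Eigen::ColMajor> Block;
* Block::Dimensions block_sizes(3, 4);     // 3 rows x 4 columns
* Block::Dimensions block_strides(1, 3);   // column-major strides in the block
* Block::Dimensions tensor_strides(1, 10); // column-major strides in the tensor
* // The first coefficient (2, 3) has linear index 2 + 3 * 10 = 32.
* Block block(32, block_sizes, block_strides, tensor_strides, block_buffer);
* \endcode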
| */ |
| |
| namespace internal { |
| |
| template <typename Index, typename Scalar, std::size_t NumDims, int Layout> |
| class TensorBlock { |
| public: |
| typedef DSizes<Index, NumDims> Dimensions; |
| |
| TensorBlock(const Index first_coeff_index, |
| const Dimensions& block_sizes, |
| const Dimensions& block_strides, |
| const Dimensions& tensor_strides, |
| Scalar* data) |
| : m_first_coeff_index(first_coeff_index), |
| m_block_sizes(block_sizes), |
| m_block_strides(block_strides), |
| m_tensor_strides(tensor_strides), |
| m_data(data) {} |
| |
| Index first_coeff_index() const { return m_first_coeff_index; } |
| |
| const Dimensions& block_sizes() const { return m_block_sizes; } |
| |
| const Dimensions& block_strides() const { return m_block_strides; } |
| |
| const Dimensions& tensor_strides() const { return m_tensor_strides; } |
| |
| Scalar* data() { return m_data; } |
| |
| const Scalar* data() const { return m_data; } |
| |
| private: |
| Index m_first_coeff_index; |
| Dimensions m_block_sizes; |
| Dimensions m_block_strides; |
| Dimensions m_tensor_strides; |
| Scalar* m_data; // Not owned. |
| }; |
| |
| template <typename Index, typename Scalar, bool Vectorizable> |
| struct TensorBlockCopyOp { |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( |
| const Index num_coeff_to_copy, const Index dst_index, |
| const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, const Index src_index, |
| const Index src_stride, const Scalar* EIGEN_RESTRICT src_data) { |
| for (Index i = 0; i < num_coeff_to_copy; ++i) { |
| dst_data[dst_index + i * dst_stride] = |
| src_data[src_index + i * src_stride]; |
| } |
| } |
| }; |
| |
// NOTE: Benchmarks of an implementation that broke each of the loops in
// these conditionals into its own template specialization (to avoid
// conditionals in the caller's loop) did not show an improvement.
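// The specialization below picks one of four copy strategies based on the
// source and destination strides:
//   - src_stride == 1 and dst_stride == 1: linear packet loads and stores;
//   - src_stride == 1 and dst_stride != 1: packet loads, scattered stores;
//   - src_stride != 1 and dst_stride == 1: gathered loads, packet stores;
//   - otherwise: a plain scalar copy loop.
// Any tail that does not fill a whole packet is copied one scalar at a time.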
| template <typename Index, typename Scalar> |
| struct TensorBlockCopyOp<Index, Scalar, true> { |
| typedef typename packet_traits<Scalar>::type Packet; |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( |
| const Index num_coeff_to_copy, const Index dst_index, |
| const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, |
| const Index src_index, const Index src_stride, |
| const Scalar* EIGEN_RESTRICT src_data) { |
| if (src_stride == 1) { |
| const Index packet_size = internal::unpacket_traits<Packet>::size; |
| const Index vectorized_size = |
| (num_coeff_to_copy / packet_size) * packet_size; |
| if (dst_stride == 1) { |
| // LINEAR |
| for (Index i = 0; i < vectorized_size; i += packet_size) { |
| Packet p = internal::ploadt<Packet, Unaligned>( |
| src_data + src_index + i); |
| internal::pstoret<Scalar, Packet, Unaligned>( |
| dst_data + dst_index + i, p); |
| } |
| for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { |
| dst_data[dst_index + i] = src_data[src_index + i]; |
| } |
| } else { |
| // SCATTER |
| for (Index i = 0; i < vectorized_size; i += packet_size) { |
| Packet p = internal::ploadt<Packet, Unaligned>( |
| src_data + src_index + i); |
| internal::pscatter<Scalar, Packet>( |
| dst_data + dst_index + i * dst_stride, p, dst_stride); |
| } |
| for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { |
| dst_data[dst_index + i * dst_stride] = src_data[src_index + i]; |
| } |
| } |
| } else { |
| if (dst_stride == 1) { |
| // GATHER |
| const Index packet_size = internal::unpacket_traits<Packet>::size; |
| const Index vectorized_size = |
| (num_coeff_to_copy / packet_size) * packet_size; |
| for (Index i = 0; i < vectorized_size; i += packet_size) { |
| Packet p = internal::pgather<Scalar, Packet>( |
| src_data + src_index + i * src_stride, src_stride); |
| internal::pstoret<Scalar, Packet, Unaligned>( |
| dst_data + dst_index + i, p); |
| } |
| for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { |
| dst_data[dst_index + i] = src_data[src_index + i * src_stride]; |
| } |
| } else { |
| // RANDOM |
| for (Index i = 0; i < num_coeff_to_copy; ++i) { |
| dst_data[dst_index + i * dst_stride] = |
| src_data[src_index + i * src_stride]; |
| } |
| } |
| } |
| } |
| }; |
| |
| /** \class TensorBlockIO |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor block IO class. |
| * |
* This class is responsible for copying data between a tensor and a tensor
* block. The 'BlockRead' template parameter selects the direction of the
* copy: when it is true, data is read from the tensor into the block; when
* it is false, data is written from the block back into the tensor.
| * |
| */ |
| template <typename Index, typename Scalar, std::size_t NumDims, int Layout, |
| bool Vectorizable, bool BlockRead> |
| class TensorBlockIO { |
| public: |
| typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> |
| TensorBlock; |
| typedef typename internal::TensorBlockCopyOp<Index, Scalar, Vectorizable> |
| TensorBlockCopyOp; |
| |
| protected: |
| struct BlockIteratorState { |
| Index input_stride; |
| Index output_stride; |
| Index input_span; |
| Index output_span; |
| Index size; |
| Index count; |
| }; |
| |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( |
| const TensorBlock& block, Index first_coeff_index, |
| const array<Index, NumDims>& tensor_to_block_dim_map, |
| const array<Index, NumDims>& tensor_strides, const Scalar* src_data, |
| Scalar* dst_data) { |
| // Calculate strides and dimensions. |
| const Index block_dim_for_tensor_stride1_dim = |
| NumDims == 0 ? 1 : |
| tensor_to_block_dim_map[static_cast<int>(Layout) == |
| static_cast<int>(ColMajor) |
| ? 0 |
| : NumDims - 1]; |
| const size_t block_inner_dim_size = |
| NumDims == 0 ? 1 : |
| block.block_sizes()[block_dim_for_tensor_stride1_dim]; |
| const size_t block_outer_dim_size = |
| NumDims == 0 ? 1 : |
| block.block_sizes().TotalSize() / block_inner_dim_size; |
| |
| Index inputIndex; |
| Index outputIndex; |
| Index input_stride; |
| Index output_stride; |
| |
| // Setup strides to read/write along the tensor's stride1 dimension. |
| if (BlockRead) { |
| inputIndex = first_coeff_index; |
| outputIndex = 0; |
| input_stride = 1; |
| output_stride = NumDims == 0 ? 1 |
| : block.block_strides()[block_dim_for_tensor_stride1_dim]; |
| } else { |
| inputIndex = 0; |
| outputIndex = first_coeff_index; |
| input_stride = NumDims == 0 ? 1 |
| : block.block_strides()[block_dim_for_tensor_stride1_dim]; |
| output_stride = 1; |
| } |
| |
| const std::size_t at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; |
| array<BlockIteratorState, at_least_1_dim> block_iter_state; |
| |
| // Initialize block iterator state. |
| for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { |
| const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) |
| ? i + 1 |
| : NumDims - i - 2; |
| block_iter_state[i].size = |
| block.block_sizes()[tensor_to_block_dim_map[dim]]; |
| if (BlockRead) { |
| block_iter_state[i].input_stride = tensor_strides[dim]; |
| block_iter_state[i].output_stride = |
| block.block_strides()[tensor_to_block_dim_map[dim]]; |
| } else { |
| block_iter_state[i].input_stride = |
| block.block_strides()[tensor_to_block_dim_map[dim]]; |
| block_iter_state[i].output_stride = tensor_strides[dim]; |
| } |
| block_iter_state[i].input_span = |
| block_iter_state[i].input_stride * (block_iter_state[i].size - 1); |
| block_iter_state[i].output_span = |
| block_iter_state[i].output_stride * (block_iter_state[i].size - 1); |
| block_iter_state[i].count = 0; |
| } |
| |
| // Iterate copying data from src to dst. |
| for (Index i = 0; i < block_outer_dim_size; ++i) { |
| TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride, |
| dst_data, inputIndex, input_stride, src_data); |
| // Update index. |
for (int j = 0; j < static_cast<int>(NumDims) - 1; ++j) {
if (++block_iter_state[j].count < block_iter_state[j].size) {
inputIndex += block_iter_state[j].input_stride;
outputIndex += block_iter_state[j].output_stride;
break;
}
block_iter_state[j].count = 0;
inputIndex -= block_iter_state[j].input_span;
outputIndex -= block_iter_state[j].output_span;
}
| } |
| } |
| }; |
| |
| /** \class TensorBlockReader |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor block reader class. |
| * |
* This class is responsible for reading a tensor block, i.e. for copying
* the coefficients covered by the block from the source tensor into the
* block's buffer.
| * |
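* For illustration only, a hypothetical use (assuming 'block' is a
* TensorBlock that maps into 'tensor_data', both defined elsewhere):
*
* \code
* typedef Eigen::internal::TensorBlockReader<Eigen::Index, float, 2,
*                                            Eigen::ColMajor, true> Reader;
* // Copy the coefficients covered by 'block' from 'tensor_data' into
* // block.data().
* Reader::Run(&block, tensor_data);
* \endcode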
| */ |
| |
| template <typename Index, typename Scalar, std::size_t NumDims, int Layout, |
| bool Vectorizable> |
| class TensorBlockReader : public TensorBlockIO<Index, Scalar, NumDims, |
| Layout, Vectorizable, true> { |
| public: |
| typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> |
| TensorBlock; |
| typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, true> |
| Base; |
| |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( |
| TensorBlock* block, const Scalar* src_data) { |
| array<Index, NumDims> tensor_to_block_dim_map; |
| for (int i = 0; i < NumDims; ++i) { |
| tensor_to_block_dim_map[i] = i; |
| } |
| Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map, |
| block->tensor_strides(), src_data, block->data()); |
| } |
| |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( |
| TensorBlock* block, Index first_coeff_index, |
| const array<Index, NumDims>& tensor_to_block_dim_map, |
| const array<Index, NumDims>& tensor_strides, const Scalar* src_data) { |
| Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map, |
| tensor_strides, src_data, block->data()); |
| } |
| }; |
| |
| /** \class TensorBlockWriter |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor block writer class. |
| * |
* This class is responsible for writing a tensor block, i.e. for copying
* the coefficients in the block's buffer back into the destination tensor.
| * |
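* For illustration only, a hypothetical use (assuming 'block' is a
* TensorBlock that maps into 'tensor_data', both defined elsewhere):
*
* \code
* typedef Eigen::internal::TensorBlockWriter<Eigen::Index, float, 2,
*                                            Eigen::ColMajor, true> Writer;
* // Copy block.data() back into the region of 'tensor_data' covered by
* // 'block'.
* Writer::Run(block, tensor_data);
* \endcode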
| */ |
| |
| template <typename Index, typename Scalar, std::size_t NumDims, int Layout, |
| bool Vectorizable> |
| class TensorBlockWriter : public TensorBlockIO<Index, Scalar, NumDims, |
| Layout, Vectorizable, false> { |
| public: |
| typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> |
| TensorBlock; |
| typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, false> |
| Base; |
| |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( |
| const TensorBlock& block, Scalar* dst_data) { |
| array<Index, NumDims> tensor_to_block_dim_map; |
| for (int i = 0; i < NumDims; ++i) { |
| tensor_to_block_dim_map[i] = i; |
| } |
| Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map, |
| block.tensor_strides(), block.data(), dst_data); |
| } |
| |
| static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( |
| const TensorBlock& block, Index first_coeff_index, |
| const array<Index, NumDims>& tensor_to_block_dim_map, |
| const array<Index, NumDims>& tensor_strides, Scalar* dst_data) { |
| Base::Copy(block, first_coeff_index, tensor_to_block_dim_map, |
| tensor_strides, block.data(), dst_data); |
| } |
| }; |
| |
enum TensorBlockShapeType {
// Block dimensions are chosen to be roughly equal in every dimension
// (approximately "square" blocks).
kUniformAllDims,
// The block-size budget is assigned to the innermost dimensions first,
// so blocks are skewed towards the inner (smallest-stride) dimensions.
kSkewedInnerDims,
};
| |
| struct TensorOpResourceRequirements { |
| TensorBlockShapeType block_shape; |
| std::size_t block_total_size; |
| // TODO(andydavis) Add 'target_num_threads' to support communication of |
| // thread-resource requirements. This will allow ops deep in the |
// expression tree (like reductions) to communicate resource
| // requirements based on local state (like the total number of reductions |
| // to be computed). |
| TensorOpResourceRequirements(internal::TensorBlockShapeType shape, |
| const std::size_t size) |
| : block_shape(shape), block_total_size(size) {} |
| }; |
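// For illustration only: an op that prefers inner-dimension-skewed blocks
// with a budget of (say) 1024 coefficients per block could request
// TensorOpResourceRequirements(kSkewedInnerDims, 1024).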
| |
| /** \class TensorBlockMapper |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor block mapper class. |
| * |
* This class is responsible for partitioning a tensor into blocks of a
* requested shape and size, and for enumerating those blocks by index.
| * |
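* For illustration only, a hypothetical use (the dimensions, the block size
* target and the 'scratch' buffer are assumptions made for this example):
*
* \code
* typedef Eigen::internal::TensorBlockMapper<Eigen::Index, float, 2,
*                                            Eigen::ColMajor> BlockMapper;
* Eigen::DSizes<Eigen::Index, 2> dims(256, 256);
* // Target blocks of roughly 1024 coefficients, similar size in all dims.
* BlockMapper mapper(dims, Eigen::internal::kUniformAllDims, 1024);
* // 'scratch' is an assumed float buffer with room for at least
* // mapper.block_dims_total_size() coefficients.
* for (Eigen::Index i = 0; i < mapper.total_block_count(); ++i) {
*   BlockMapper::TensorBlock block = mapper.GetBlockForIndex(i, scratch);
*   // Process 'block', e.g. with TensorBlockReader / TensorBlockWriter.
* }
* \endcode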
| */ |
| |
| template <typename Index, typename Scalar, std::size_t NumDims, int Layout> |
| class TensorBlockMapper { |
| public: |
| typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> |
| TensorBlock; |
| |
| TensorBlockMapper(const Eigen::DSizes<Index, NumDims>& dims, |
| const TensorBlockShapeType block_shape, |
| size_t min_target_size) |
| : m_dimensions(dims), m_block_dim_sizes(dims), m_total_block_count(1) { |
| min_target_size = numext::maxi<size_t>(1, min_target_size); |
| if (m_dimensions.TotalSize() == 0) { |
// Corner case: one of the dimensions is zero. The logic below is too
// complex to handle this case in general, so just use a unit block size.
| // Note: we must not yield blocks with zero dimensions (recipe for |
| // overflows/underflows, divisions by zero and NaNs later). |
| for (int i = 0; i < NumDims; ++i) { |
| m_block_dim_sizes[i] = 1; |
| } |
| } else if (m_block_dim_sizes.TotalSize() > min_target_size) { |
| if (block_shape == kUniformAllDims) { |
| // Tensor will not fit within 'min_target_size' budget: calculate tensor |
| // block dimension sizes based on "square" dimension size target. |
| const size_t dim_size_target = |
| std::pow(static_cast<float>(min_target_size), |
| 1.0 / static_cast<float>(m_block_dim_sizes.rank())); |
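// For example, with min_target_size == 1024 and rank() == 2 this gives a
// dim_size_target of 32 (i.e. blocks of roughly 32x32 coefficients).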
| for (size_t i = 0; i < m_block_dim_sizes.rank(); ++i) { |
// TODO(andydavis) Adjust the innermost 'm_block_dim_sizes' entry to make
// it a multiple of the packet size. Note that reducing it in this manner
// can increase the number of blocks, and so will amplify any per-block
// overhead.
| m_block_dim_sizes[i] = |
| numext::mini(dim_size_target, static_cast<size_t>(m_dimensions[i])); |
| } |
// Add any unallocated coefficients to the inner dimension(s).
| Index total_size = m_block_dim_sizes.TotalSize(); |
| for (int i = 0; i < NumDims; ++i) { |
| const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) |
| ? i : NumDims - i - 1; |
| if (m_block_dim_sizes[dim] < m_dimensions[dim]) { |
| const Index total_size_other_dims = total_size / |
| m_block_dim_sizes[dim]; |
| const Index alloc_avail = divup<Index>(min_target_size, total_size_other_dims); |
| if (alloc_avail == m_block_dim_sizes[dim]) { |
| // Insufficient excess coefficients to allocate. |
| break; |
| } |
| m_block_dim_sizes[dim] = numext::mini(m_dimensions[dim], alloc_avail); |
| total_size = total_size_other_dims * m_block_dim_sizes[dim]; |
| } |
| } |
| } else { |
| eigen_assert(block_shape == kSkewedInnerDims); |
| Index coeff_to_allocate = min_target_size; |
| for (int i = 0; i < NumDims; ++i) { |
| const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) |
| ? i : NumDims - i - 1; |
| m_block_dim_sizes[dim] = numext::mini(coeff_to_allocate, |
| m_dimensions[dim]); |
| coeff_to_allocate = divup(coeff_to_allocate, |
| numext::maxi(static_cast<Index>(1), m_block_dim_sizes[dim])); |
| } |
| eigen_assert(coeff_to_allocate == 1); |
| } |
| } |
| eigen_assert(m_block_dim_sizes.TotalSize() >= |
| numext::mini(min_target_size, m_dimensions.TotalSize())); |
| |
| // Calculate block counts by dimension and total block count. |
| DSizes<Index, NumDims> block_count; |
| for (size_t i = 0; i < block_count.rank(); ++i) { |
| block_count[i] = |
| (m_dimensions[i] + m_block_dim_sizes[i] - 1) / m_block_dim_sizes[i]; |
| } |
| m_total_block_count = array_prod(block_count); |
| |
| // Calculate block strides (used for enumerating blocks). |
| if (NumDims > 0) { |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| m_block_strides[0] = 1; |
| m_tensor_strides[0] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; |
| m_tensor_strides[i] = m_tensor_strides[i - 1] * m_dimensions[i - 1]; |
| } |
| } else { |
| m_block_strides[NumDims - 1] = 1; |
| m_tensor_strides[NumDims - 1] = 1; |
| for (int i = NumDims - 2; i >= 0; --i) { |
| m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; |
| m_tensor_strides[i] = m_tensor_strides[i + 1] * m_dimensions[i + 1]; |
| } |
| } |
| } |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock |
| GetBlockForIndex(Index block_index, Scalar* data) const { |
| Index first_coeff_index = 0; |
| DSizes<Index, NumDims> coords; |
| DSizes<Index, NumDims> sizes; |
| DSizes<Index, NumDims> strides; |
| if (NumDims > 0) { |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int i = NumDims - 1; i > 0; --i) { |
| const Index idx = block_index / m_block_strides[i]; |
| coords[i] = idx * m_block_dim_sizes[i]; |
| sizes[i] = |
| numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); |
| block_index -= idx * m_block_strides[i]; |
| first_coeff_index += coords[i] * m_tensor_strides[i]; |
| } |
| coords[0] = block_index * m_block_dim_sizes[0]; |
| sizes[0] = |
| numext::mini((m_dimensions[0] - coords[0]), m_block_dim_sizes[0]); |
| first_coeff_index += coords[0] * m_tensor_strides[0]; |
| |
| strides[0] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| strides[i] = strides[i - 1] * sizes[i - 1]; |
| } |
| } else { |
| for (int i = 0; i < NumDims - 1; ++i) { |
| const Index idx = block_index / m_block_strides[i]; |
| coords[i] = idx * m_block_dim_sizes[i]; |
| sizes[i] = |
| numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); |
| block_index -= idx * m_block_strides[i]; |
| first_coeff_index += coords[i] * m_tensor_strides[i]; |
| } |
| coords[NumDims - 1] = block_index * m_block_dim_sizes[NumDims - 1]; |
| sizes[NumDims - 1] = |
| numext::mini((m_dimensions[NumDims - 1] - coords[NumDims - 1]), |
| m_block_dim_sizes[NumDims - 1]); |
| first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; |
| |
| strides[NumDims - 1] = 1; |
| for (int i = NumDims - 2; i >= 0; --i) { |
| strides[i] = strides[i + 1] * sizes[i + 1]; |
| } |
| } |
| } |
| |
| return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides, |
| data); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { |
| return m_total_block_count; |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const { |
| return m_block_dim_sizes.TotalSize(); |
| } |
| |
| private: |
| DSizes<Index, NumDims> m_dimensions; |
| DSizes<Index, NumDims> m_block_dim_sizes; |
| DSizes<Index, NumDims> m_block_strides; |
| DSizes<Index, NumDims> m_tensor_strides; |
| Index m_total_block_count; |
| }; |
| |
| /** \class TensorSliceBlockMapper |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor slice block mapper class. |
| * |
* This class is responsible for iterating over the blocks of a slice of a
* tensor. It also supports reordering the dimensions used to compute the
* block strides (via 'block_stride_order'), so that callers can assign the
* smallest strides to the dimensions they intend to process together.
| * |
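* For illustration only, a hypothetical use (the dimensions, offsets,
* extents and the 'scratch' buffer are assumptions made for this example):
*
* \code
* typedef Eigen::internal::TensorSliceBlockMapper<Eigen::Index, float, 2,
*                                                 Eigen::ColMajor> SliceMapper;
* typedef SliceMapper::Dimensions Dims;
* // Enumerate 64x64 blocks of the 128x128 slice at offset (32, 32) of a
* // 256x256 tensor, using an identity block stride order.
* SliceMapper mapper(Dims(256, 256),  // tensor dimensions
*                    Dims(32, 32),    // slice offsets
*                    Dims(128, 128),  // slice extents
*                    Dims(64, 64),    // block sizes
*                    Dims(0, 1));     // block stride order
* for (Eigen::Index i = 0; i < mapper.total_block_count(); ++i) {
*   SliceMapper::TensorBlock block = mapper.GetBlockForIndex(i, scratch);
*   // Process 'block'.
* }
* \endcode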
| */ |
| |
| template <typename Index, typename Scalar, std::size_t NumDims, int Layout> |
| class TensorSliceBlockMapper { |
| public: |
| typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> |
| TensorBlock; |
| typedef DSizes<Index, NumDims> Dimensions; |
| |
| TensorSliceBlockMapper(const Dimensions& tensor_dims, |
| const Dimensions& tensor_slice_offsets, |
| const Dimensions& tensor_slice_extents, |
| const Dimensions& block_dim_sizes, |
| const Dimensions& block_stride_order) |
| : m_tensor_dimensions(tensor_dims), |
| m_tensor_slice_offsets(tensor_slice_offsets), |
| m_tensor_slice_extents(tensor_slice_extents), |
| m_block_dim_sizes(block_dim_sizes), |
| m_block_stride_order(block_stride_order), |
| m_total_block_count(1) { |
| // Calculate block counts by dimension and total block count. |
| DSizes<Index, NumDims> block_count; |
| for (size_t i = 0; i < block_count.rank(); ++i) { |
| block_count[i] = (m_tensor_slice_extents[i] + m_block_dim_sizes[i] - 1) / |
| m_block_dim_sizes[i]; |
| } |
| m_total_block_count = array_prod(block_count); |
| |
| // Calculate block strides (used for enumerating blocks). |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| m_block_strides[0] = 1; |
| m_tensor_strides[0] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; |
| m_tensor_strides[i] = m_tensor_strides[i - 1] * |
| m_tensor_dimensions[i - 1]; |
| } |
| } else { |
| m_block_strides[NumDims - 1] = 1; |
| m_tensor_strides[NumDims - 1] = 1; |
| for (int i = NumDims - 2; i >= 0; --i) { |
| m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; |
| m_tensor_strides[i] = m_tensor_strides[i + 1] * |
| m_tensor_dimensions[i + 1]; |
| } |
| } |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock |
| GetBlockForIndex(Index block_index, Scalar* data) const { |
| Index first_coeff_index = 0; |
| DSizes<Index, NumDims> coords; |
| DSizes<Index, NumDims> sizes; |
| DSizes<Index, NumDims> strides; |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int i = NumDims - 1; i > 0; --i) { |
| const Index idx = block_index / m_block_strides[i]; |
| coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; |
| sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], |
| m_block_dim_sizes[i]); |
| block_index -= idx * m_block_strides[i]; |
| first_coeff_index += coords[i] * m_tensor_strides[i]; |
| } |
| coords[0] = m_tensor_slice_offsets[0] + |
| block_index * m_block_dim_sizes[0]; |
| sizes[0] = numext::mini(m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0], |
| m_block_dim_sizes[0]); |
| first_coeff_index += coords[0] * m_tensor_strides[0]; |
| |
| Index prev_dim = m_block_stride_order[0]; |
| strides[prev_dim] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| const Index curr_dim = m_block_stride_order[i]; |
| strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; |
| prev_dim = curr_dim; |
| } |
| } else { |
| for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { |
| const Index idx = block_index / m_block_strides[i]; |
| coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; |
| sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], |
| m_block_dim_sizes[i]); |
| block_index -= idx * m_block_strides[i]; |
| first_coeff_index += coords[i] * m_tensor_strides[i]; |
| } |
| coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] + |
| block_index * m_block_dim_sizes[NumDims - 1]; |
| sizes[NumDims - 1] = numext::mini( |
| m_tensor_slice_offsets[NumDims - 1] + m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1], |
| m_block_dim_sizes[NumDims - 1]); |
| first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; |
| |
| Index prev_dim = m_block_stride_order[NumDims - 1]; |
| strides[prev_dim] = 1; |
| for (int i = NumDims - 2; i >= 0; --i) { |
| const Index curr_dim = m_block_stride_order[i]; |
| strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; |
| prev_dim = curr_dim; |
| } |
| } |
| |
| return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides, |
| data); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { |
| return m_total_block_count; |
| } |
| |
| private: |
| Dimensions m_tensor_dimensions; |
| Dimensions m_tensor_slice_offsets; |
| Dimensions m_tensor_slice_extents; |
| Dimensions m_tensor_strides; |
| Dimensions m_block_dim_sizes; |
| Dimensions m_block_stride_order; |
| Dimensions m_block_strides; |
| Index m_total_block_count; |
| }; |
| |
| } // end namespace internal |
| |
| } // end namespace Eigen |
| |
| #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H |