Update Eigen to commit: dd56367554cdf0662cc1d4a8e462e8c3e8656d08

CHANGELOG
=========
dd5636755 - Fix docs job for nightlies
d79bac0d3 - Fix boolean scatter and random generation for tensors.
9935396b1 - Specify constructor template arguments for ConstexprTest struct
72adf891d - Slightly simplify ForkJoin code, and make sure the test is actually run.
6aebfa9ac - Build docs on push, and don't expire
bddaa99e1 - Fix bitwise operation error when compiling as C++26
e42dceb3a - Fix implicit copy-constructor warning in TensorRef.
5fc6fc988 - Initialize matrix in bicgstab test
0ae7b5901 - Make assignment constexpr
4dda5b927 - fix Warray-bounds in inner product

PiperOrigin-RevId: 731494675
Change-Id: Id59938322667f1bd2b3348ce69f1bc143608e9e4
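
The headline change is making dense assignment usable in constant expressions (`0ae7b5901`): `dense_assignment_loop` becomes a dispatcher that, when `__cpp_lib_is_constant_evaluated` is available (C++20), routes compile-time evaluation to the unvectorized `DefaultTraversal`/`NoUnrolling` path. A minimal sketch of what this enables, mirroring the new `test/constexpr.cpp` case (illustrative only, not part of the patch; `AssignAtCompileTime` is a hypothetical name analogous to the test's `ConstexprTest`):

```cpp
#include <Eigen/Core>

// Wrapper struct so that `A = B` exercises operator= (assignment) rather
// than being elided into construction -- the same trick the new test uses.
template <typename Scalar, int Rows>
struct AssignAtCompileTime {
  constexpr AssignAtCompileTime(const Eigen::Matrix<Scalar, Rows, Rows>& B) { A = B; }
  Eigen::Matrix<Scalar, Rows, Rows> A;
};

// Requires C++20 or later so the constant-evaluated branch is taken.
constexpr AssignAtCompileTime<double, 2> obj(Eigen::Matrix2d({{1, 2}, {3, 4}}));
static_assert(obj.A(0, 0) == 1 && obj.A.coeff(0, 1) == 2);
```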
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 5a2a3ac..0ea1bc3 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -191,7 +191,7 @@
   static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;
   static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     kernel.assignCoeffByOuterInner(Outer, Inner);
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
   }
@@ -204,7 +204,7 @@
 
 template <typename Kernel, int Index_, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer) {
     kernel.assignCoeffByOuterInner(outer, Index_);
     copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_ + 1, Stop>::run(kernel, outer);
   }
@@ -212,7 +212,7 @@
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
 };
 
 /***********************
@@ -221,7 +221,7 @@
 
 template <typename Kernel, int Index_, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     kernel.assignCoeff(Index_);
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
   }
@@ -229,7 +229,7 @@
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
 };
 
 /**************************
@@ -270,7 +270,7 @@
 
 template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
 };
 
 /***************************************************************************
@@ -281,7 +281,21 @@
 
 template <typename Kernel, int Traversal = Kernel::AssignmentTraits::Traversal,
           int Unrolling = Kernel::AssignmentTraits::Unrolling>
-struct dense_assignment_loop;
+struct dense_assignment_loop_impl;
+
+template <typename Kernel, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+#ifdef __cpp_lib_is_constant_evaluated
+    if (internal::is_constant_evaluated())
+      dense_assignment_loop_impl<Kernel, Traversal == AllAtOnceTraversal ? AllAtOnceTraversal : DefaultTraversal,
+                                 NoUnrolling>::run(kernel);
+    else
+#endif
+      dense_assignment_loop_impl<Kernel, Traversal, Unrolling>::run(kernel);
+  }
+};
 
 /************************
 ***** Special Cases *****
@@ -289,7 +303,7 @@
 
 // Zero-sized assignment is a no-op.
 template <typename Kernel, int Unrolling>
-struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling> {
+struct dense_assignment_loop_impl<Kernel, AllAtOnceTraversal, Unrolling> {
   static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
 
   EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) {
@@ -302,8 +316,8 @@
 ************************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling> {
-  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& kernel) {
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, NoUnrolling> {
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& kernel) {
     for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
       for (Index inner = 0; inner < kernel.innerSize(); ++inner) {
         kernel.assignCoeffByOuterInner(outer, inner);
@@ -313,19 +327,19 @@
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling> {
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, CompleteUnrolling> {
   static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);
   }
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling> {
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, InnerUnrolling> {
   static constexpr int InnerSizeAtCompileTime = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     const Index outerSize = kernel.outerSize();
     for (Index outer = 0; outer < outerSize; ++outer)
       copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSizeAtCompileTime>::run(kernel, outer);
@@ -382,7 +396,7 @@
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> {
+struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, NoUnrolling> {
   using Scalar = typename Kernel::Scalar;
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
@@ -407,7 +421,7 @@
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling> {
+struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, CompleteUnrolling> {
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
   static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
@@ -424,7 +438,7 @@
 **************************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> {
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, NoUnrolling> {
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
   static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
@@ -440,7 +454,7 @@
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
   static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
 
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
@@ -449,7 +463,7 @@
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
   static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
   static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
   static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
@@ -467,7 +481,7 @@
 ***********************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling> {
+struct dense_assignment_loop_impl<Kernel, LinearTraversal, NoUnrolling> {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
     const Index size = kernel.size();
     for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
@@ -475,7 +489,7 @@
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> {
+struct dense_assignment_loop_impl<Kernel, LinearTraversal, CompleteUnrolling> {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run(
         kernel);
@@ -487,7 +501,7 @@
 ***************************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> {
+struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, NoUnrolling> {
   using Scalar = typename Kernel::Scalar;
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
@@ -528,7 +542,7 @@
 
 #if EIGEN_UNALIGNED_VECTORIZE
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> {
+struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, InnerUnrolling> {
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
   static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
@@ -566,9 +580,10 @@
   typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
   typedef typename AssignmentTraits::PacketType PacketType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE generic_dense_assignment_kernel(DstEvaluatorType& dst,
-                                                                        const SrcEvaluatorType& src,
-                                                                        const Functor& func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr generic_dense_assignment_kernel(DstEvaluatorType& dst,
+                                                                                  const SrcEvaluatorType& src,
+                                                                                  const Functor& func,
+                                                                                  DstXprType& dstExpr)
       : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) {
 #ifdef EIGEN_DEBUG_ASSIGN
     AssignmentTraits::debug();
@@ -586,7 +601,7 @@
   EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
 
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) {
     m_functor.assignCoeff(m_dst.coeffRef(row, col), m_src.coeff(row, col));
   }
 
@@ -596,7 +611,7 @@
   }
 
   /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeffByOuterInner(Index outer, Index inner) {
     Index row = rowIndexByOuterInner(outer, inner);
     Index col = colIndexByOuterInner(outer, inner);
     assignCoeff(row, col);
@@ -620,7 +635,7 @@
     assignPacket<StoreMode, LoadMode, Packet>(row, col);
   }
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::RowsAtCompileTime) == 1          ? 0
            : int(Traits::ColsAtCompileTime) == 1        ? inner
@@ -628,7 +643,7 @@
                                                         : inner;
   }
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index colIndexByOuterInner(Index outer, Index inner) {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::ColsAtCompileTime) == 1          ? 0
            : int(Traits::RowsAtCompileTime) == 1        ? inner
@@ -672,16 +687,16 @@
  ***************************************************************************/
 
 template <typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
-                                                             const Functor& /*func*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
+                                                                       const Functor& /*func*/) {
   EIGEN_ONLY_USED_FOR_DEBUG(dst);
   EIGEN_ONLY_USED_FOR_DEBUG(src);
   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
 }
 
 template <typename DstXprType, typename SrcXprType, typename T1, typename T2>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
-                                                             const internal::assign_op<T1, T2>& /*func*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
+                                                                       const internal::assign_op<T1, T2>& /*func*/) {
   Index dstRows = src.rows();
   Index dstCols = src.cols();
   if (((dst.rows() != dstRows) || (dst.cols() != dstCols))) dst.resize(dstRows, dstCols);
@@ -750,7 +765,7 @@
 // not has to bother about these annoying details.
 
 template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(Dst& dst, const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(Dst& dst, const Src& src) {
   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
 }
 template <typename Dst, typename Src>
@@ -767,7 +782,7 @@
 }
 
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
     Dst& dst, const Src& src, const Func& func, std::enable_if_t<!evaluator_assume_aliasing<Src>::value, void*> = 0) {
   call_assignment_no_alias(dst, src, func);
 }
@@ -851,9 +866,12 @@
 // both partial specialization+SFINAE without ambiguous specialization
 template <typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(DstXprType& dst, const SrcXprType& src,
+                                                                  const Functor& func) {
 #ifndef EIGEN_NO_DEBUG
-    internal::check_for_aliasing(dst, src);
+    if (!internal::is_constant_evaluated()) {
+      internal::check_for_aliasing(dst, src);
+    }
 #endif
 
     call_dense_assignment_loop(dst, src, func);
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index 6d16700..894bfc1 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -50,7 +50,7 @@
   /** \returns a const reference to the derived object */
   EIGEN_DEVICE_FUNC constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
-  EIGEN_DEVICE_FUNC inline Derived& const_cast_derived() const {
+  EIGEN_DEVICE_FUNC inline constexpr Derived& const_cast_derived() const {
     return *static_cast<Derived*>(const_cast<EigenBase*>(this));
   }
   EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h
index dd57ca1..3b0af91 100644
--- a/Eigen/src/Core/Fill.h
+++ b/Eigen/src/Core/Fill.h
@@ -60,12 +60,12 @@
   using Func = scalar_constant_op<Scalar>;
   using PlainObject = typename Xpr::PlainObject;
   using Constant = typename PlainObject::ConstantReturnType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const Scalar& val) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const Scalar& val) {
     const Constant src(dst.rows(), dst.cols(), val);
     run(dst, src);
   }
   template <typename SrcXpr>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
     call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
   }
 };
@@ -100,12 +100,12 @@
   using Scalar = typename Xpr::Scalar;
   using PlainObject = typename Xpr::PlainObject;
   using Zero = typename PlainObject::ZeroReturnType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst) {
     const Zero src(dst.rows(), dst.cols());
     run(dst, src);
   }
   template <typename SrcXpr>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
     call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
   }
 };
diff --git a/Eigen/src/Core/InnerProduct.h b/Eigen/src/Core/InnerProduct.h
index 1e16942..9849d9b 100644
--- a/Eigen/src/Core/InnerProduct.h
+++ b/Eigen/src/Core/InnerProduct.h
@@ -57,16 +57,20 @@
 
 template <typename Func, typename Lhs, typename Rhs>
 struct inner_product_evaluator {
-  static constexpr int LhsFlags = evaluator<Lhs>::Flags, RhsFlags = evaluator<Rhs>::Flags,
-                       SizeAtCompileTime = min_size_prefer_fixed(Lhs::SizeAtCompileTime, Rhs::SizeAtCompileTime),
-                       LhsAlignment = evaluator<Lhs>::Alignment, RhsAlignment = evaluator<Rhs>::Alignment;
+  static constexpr int LhsFlags = evaluator<Lhs>::Flags;
+  static constexpr int RhsFlags = evaluator<Rhs>::Flags;
+  static constexpr int SizeAtCompileTime = size_prefer_fixed(Lhs::SizeAtCompileTime, Rhs::SizeAtCompileTime);
+  static constexpr int MaxSizeAtCompileTime =
+      min_size_prefer_fixed(Lhs::MaxSizeAtCompileTime, Rhs::MaxSizeAtCompileTime);
+  static constexpr int LhsAlignment = evaluator<Lhs>::Alignment;
+  static constexpr int RhsAlignment = evaluator<Rhs>::Alignment;
 
   using Scalar = typename Func::result_type;
   using Packet = typename find_inner_product_packet<Scalar, SizeAtCompileTime>::type;
 
   static constexpr bool Vectorize =
       bool(LhsFlags & RhsFlags & PacketAccessBit) && Func::PacketAccess &&
-      ((SizeAtCompileTime == Dynamic) || (unpacket_traits<Packet>::size <= SizeAtCompileTime));
+      ((MaxSizeAtCompileTime == Dynamic) || (unpacket_traits<Packet>::size <= MaxSizeAtCompileTime));
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit inner_product_evaluator(const Lhs& lhs, const Rhs& rhs,
                                                                          Func func = Func())
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 77f0cfa..5e91fba 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -1694,10 +1694,24 @@
 }
 template <>
 EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
-  to[4 * stride * 0] = _mm_cvtsi128_si32(from);
-  to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
-  to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
-  to[4 * stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+  EIGEN_ALIGN16 bool tmp[16];
+  pstore(tmp, from);
+  to[stride * 0] = tmp[0];
+  to[stride * 1] = tmp[1];
+  to[stride * 2] = tmp[2];
+  to[stride * 3] = tmp[3];
+  to[stride * 4] = tmp[4];
+  to[stride * 5] = tmp[5];
+  to[stride * 6] = tmp[6];
+  to[stride * 7] = tmp[7];
+  to[stride * 8] = tmp[8];
+  to[stride * 9] = tmp[9];
+  to[stride * 10] = tmp[10];
+  to[stride * 11] = tmp[11];
+  to[stride * 12] = tmp[12];
+  to[stride * 13] = tmp[13];
+  to[stride * 14] = tmp[14];
+  to[stride * 15] = tmp[15];
 }
 
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
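
For context on the `pscatter<bool, Packet16b>` fix above: the old code wrote only four of the sixteen `bool` lanes, and at offsets `4 * stride * k` rather than `stride * k`. The replacement spills the packet to an aligned buffer and writes every lane. A sketch of the scalar semantics a scatter specialization is expected to match (illustrative, with hypothetical names; not Eigen's generic implementation):

```cpp
#include <cstddef>

// Lane i of the packet is written to to[stride * i].
template <typename Scalar, int N>
void pscatter_reference(Scalar* to, const Scalar (&lanes)[N], std::ptrdiff_t stride) {
  for (int i = 0; i < N; ++i) to[stride * i] = lanes[i];
}
```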
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 09d1da8..3687bb2 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -23,7 +23,7 @@
  */
 template <typename DstScalar, typename SrcScalar>
 struct assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
 
   template <int Alignment, typename Packet>
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h
index 9c8f095..fc708ee 100644
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h
@@ -95,7 +95,8 @@
 namespace internal {
 
 template <int Arch, typename VectorLhs, typename VectorRhs, typename Scalar = typename VectorLhs::Scalar,
-          bool Vectorizable = bool((evaluator<VectorLhs>::Flags & evaluator<VectorRhs>::Flags) & PacketAccessBit)>
+          bool Vectorizable =
+              bool((int(evaluator<VectorLhs>::Flags) & int(evaluator<VectorRhs>::Flags)) & PacketAccessBit)>
 struct cross3_impl {
   EIGEN_DEVICE_FUNC static inline typename internal::plain_matrix_type<VectorLhs>::type run(const VectorLhs& lhs,
                                                                                             const VectorRhs& rhs) {
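
The `int()` casts above correspond to `bddaa99e1`: the two `Flags` constants come from distinct (anonymous) enumeration types, and arithmetic/bitwise operations mixing unrelated enumeration types, deprecated since C++20, are removed in C++26 (P2864). A standalone illustration with assumed names, not Eigen code:

```cpp
// Two unrelated enumeration types, as produced by different evaluator<> instantiations.
enum LhsFlags { LhsPacketAccessBit = 0x8 };
enum RhsFlags { RhsPacketAccessBit = 0x8 };

// Ill-formed in C++26 (deprecated in C++20/23):
//   constexpr int bad = LhsPacketAccessBit & RhsPacketAccessBit;
// Casting each operand to int first compiles in every language mode:
constexpr int ok = int(LhsPacketAccessBit) & int(RhsPacketAccessBit);
```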
diff --git a/Eigen/src/ThreadPool/ForkJoin.h b/Eigen/src/ThreadPool/ForkJoin.h
index d6ea4dd..f67abd3 100644
--- a/Eigen/src/ThreadPool/ForkJoin.h
+++ b/Eigen/src/ThreadPool/ForkJoin.h
@@ -31,7 +31,7 @@
 // where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
 // task function `g(k)`, the same operation is applied with
 //
-//   f(i,j) = [&](){ for(int k = i; k < j; ++k) g(k); };
+//   f(i,j) = [&](){ for(Index k = i; k < j; ++k) g(k); };
 //
 // Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
 // given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
@@ -45,51 +45,50 @@
 // ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
 // ```
 //
-// Example usage #2 (asynchronous):
+// Example usage #2 (executing multiple tasks asynchronously, each one parallelized with ParallelFor):
 // ```
 // ThreadPool thread_pool(num_threads);
-// Barrier barrier(num_tasks * num_async_calls);
-// auto done = [&](){barrier.Notify();};
-// for (int k=0; k<num_async_calls; ++k) {
-//   thread_pool.Schedule([&](){
-//     ForkJoinScheduler::ParallelForAsync(0, num_tasks, granularity, parallel_task, done, &thread_pool);
-//   });
+// Barrier barrier(num_async_calls);
+// auto done = [&](){ barrier.Notify(); };
+// for (Index k=0; k<num_async_calls; ++k) {
+//   ForkJoinScheduler::ParallelForAsync(task_start[k], task_end[k], granularity[k], parallel_task[k], done,
+//   &thread_pool);
 // }
 // barrier.Wait();
 // ```
 class ForkJoinScheduler {
  public:
-  // Runs `do_func` asynchronously for the range [start, end) with a specified granularity. `do_func` should
-  // either be of type `std::function<void(int)>` or `std::function<void(int, int)`.
-  // If `end > start`, the `done` callback will be called `end - start` times when all tasks have been
-  // executed. Otherwise, `done` is called only once.
-  template <typename DoFnType>
-  static void ParallelForAsync(int start, int end, int granularity, DoFnType do_func, std::function<void()> done,
-                               Eigen::ThreadPool* thread_pool) {
+  // Runs `do_func` asynchronously for the range [start, end) with a specified
+  // granularity. `do_func` should be of type `std::function<void(Index,
+  // Index)>`. `done()` is called exactly once after all tasks have been executed.
+  template <typename DoFnType, typename DoneFnType>
+  static void ParallelForAsync(Index start, Index end, Index granularity, DoFnType&& do_func, DoneFnType&& done,
+                               ThreadPool* thread_pool) {
     if (start >= end) {
       done();
       return;
     }
-    ForkJoinScheduler::RunParallelForAsync(start, end, granularity, do_func, done, thread_pool);
+    thread_pool->Schedule([start, end, granularity, thread_pool, do_func = std::forward<DoFnType>(do_func),
+                           done = std::forward<DoneFnType>(done)]() {
+      RunParallelFor(start, end, granularity, do_func, thread_pool);
+      done();
+    });
   }
 
   // Synchronous variant of ParallelForAsync.
   template <typename DoFnType>
-  static void ParallelFor(int start, int end, int granularity, DoFnType do_func, Eigen::ThreadPool* thread_pool) {
+  static void ParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
     if (start >= end) return;
-    auto dummy_done = []() {};
     Barrier barrier(1);
-    thread_pool->Schedule([start, end, granularity, thread_pool, &do_func, &dummy_done, &barrier]() {
-      ForkJoinScheduler::ParallelForAsync(start, end, granularity, do_func, dummy_done, thread_pool);
-      barrier.Notify();
-    });
+    auto done = [&barrier]() { barrier.Notify(); };
+    ParallelForAsync(start, end, granularity, do_func, done, thread_pool);
     barrier.Wait();
   }
 
  private:
   // Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
   template <typename LeftType, typename RightType>
-  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, Eigen::ThreadPool* thread_pool) {
+  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, ThreadPool* thread_pool) {
     std::atomic<bool> right_done(false);
     auto execute_right = [&right_thunk, &right_done]() {
       std::forward<RightType>(right_thunk)();
@@ -97,47 +96,38 @@
     };
     thread_pool->Schedule(execute_right);
     std::forward<LeftType>(left_thunk)();
-    Eigen::ThreadPool::Task task;
+    ThreadPool::Task task;
     while (!right_done.load(std::memory_order_acquire)) {
       thread_pool->MaybeGetTask(&task);
       if (task.f) task.f();
     }
   }
 
-  // Runs `do_func` in parallel for the range [start, end). The main recursive asynchronous runner that
-  // calls `ForkJoin`.
-  static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int)>& do_func,
-                                  std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
-    std::function<void(int, int)> wrapped_do_func = [&do_func](int start, int end) {
-      for (int i = start; i < end; ++i) do_func(i);
-    };
-    ForkJoinScheduler::RunParallelForAsync(start, end, granularity, wrapped_do_func, done, thread_pool);
+  static Index ComputeMidpoint(Index start, Index end, Index granularity) {
+    // Typical workloads choose initial values of `{start, end, granularity}` such that `end - start` and
+    // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
+    // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
+    // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
+    // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
+    const Index size = end - start;
+    const Index offset = numext::round_down(9 * (size + 1) / 16, granularity);
+    return start + offset;
   }
 
-  // Variant of `RunAsyncParallelFor` that uses a do function that operates on an index range.
-  // Specifically, `do_func` takes two arguments: the start and end of the range.
-  static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int, int)>& do_func,
-                                  std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
-    if ((end - start) <= granularity) {
+  template <typename DoFnType>
+  static void RunParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
+    Index mid = ComputeMidpoint(start, end, granularity);
+    if ((end - start) < granularity || mid == start || mid == end) {
       do_func(start, end);
-      for (int j = 0; j < end - start; ++j) done();
-    } else {
-      // Typical workloads choose initial values of `{start, end, granularity}` such that `start - end` and
-      // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
-      // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
-      // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
-      // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
-      const int size = end - start;
-      const int mid = start + 9 * (size + 1) / 16;
-      ForkJoinScheduler::ForkJoin(
-          [start, mid, granularity, &do_func, &done, thread_pool]() {
-            RunParallelForAsync(start, mid, granularity, do_func, done, thread_pool);
-          },
-          [mid, end, granularity, &do_func, &done, thread_pool]() {
-            RunParallelForAsync(mid, end, granularity, do_func, done, thread_pool);
-          },
-          thread_pool);
+      return;
     }
+    ForkJoin([start, mid, granularity, &do_func, thread_pool]() {
+               RunParallelFor(start, mid, granularity, do_func, thread_pool);
+             },
+             [mid, end, granularity, &do_func, thread_pool]() {
+               RunParallelFor(mid, end, granularity, do_func, thread_pool);
+             },
+             thread_pool);
   }
 };
 
diff --git a/Eigen/src/ThreadPool/NonBlockingThreadPool.h b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
index 11dfae3..4ec1354 100644
--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -156,7 +156,10 @@
   // Tries to assign work to the current task.
   void MaybeGetTask(Task* t) {
     PerThread* pt = GetPerThread();
-    Queue& q = thread_data_[pt->thread_id].queue;
+    const int thread_id = pt->thread_id;
+    // If we are not a worker thread of this pool, we can't get any work.
+    if (thread_id < 0) return;
+    Queue& q = thread_data_[thread_id].queue;
     *t = q.PopFront();
     if (t->f) return;
     if (num_threads_ == 1) {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fdfde45..3a67ab1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -320,6 +320,7 @@
 ei_add_test(threads_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 ei_add_test(threads_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 ei_add_test(threads_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+ei_add_test(threads_fork_join "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
 
 check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
index 7ff2f3d..6d837e5 100644
--- a/test/bicgstab.cpp
+++ b/test/bicgstab.cpp
@@ -52,7 +52,7 @@
 
 // https://gitlab.com/libeigen/eigen/-/issues/2899
 void test_2899() {
-  Eigen::MatrixXd A(4, 4);
+  Eigen::MatrixXd A = Eigen::MatrixXd::Zero(4, 4);
   A(0, 0) = 1;
   A(1, 0) = -1.0 / 6;
   A(1, 1) = 2.0 / 3;
@@ -64,7 +64,7 @@
   A(3, 1) = -1.0 / 3;
   A(3, 2) = -1.0 / 3;
   A(3, 3) = 2.0 / 3;
-  Eigen::VectorXd b(4);
+  Eigen::VectorXd b = Eigen::VectorXd::Zero(4);
   b(0) = 0;
   b(1) = 1;
   b(2) = 1;
diff --git a/test/constexpr.cpp b/test/constexpr.cpp
index 34c728f..ecfda0a 100644
--- a/test/constexpr.cpp
+++ b/test/constexpr.cpp
@@ -10,6 +10,13 @@
 #define EIGEN_TESTING_CONSTEXPR
 #include "main.h"
 
+template <typename Scalar, int Rows>
+struct ConstexprTest {
+  constexpr ConstexprTest(const Matrix<Scalar, Rows, Rows>& B) { A = B; }
+
+  Matrix<Scalar, Rows, Rows> A;
+};
+
 EIGEN_DECLARE_TEST(constexpr) {
   // Clang accepts (some of) this code when using C++14/C++17, but GCC does not like
   // the fact that `T array[Size]` inside Eigen::internal::plain_array is not initialized
@@ -33,6 +40,18 @@
   VERIFY_IS_EQUAL(vec.size(), 3);
   static_assert(vec.coeff(0, 1) == 2);
 
+  // Check assignment. A wrapper struct is used to avoid copy elision.
+  constexpr ConstexprTest<double, 2> obj1(Matrix2d({{1, 2}, {3, 4}}));
+  VERIFY_IS_EQUAL(obj1.A.size(), 4);
+  static_assert(obj1.A(0, 0) == 1);
+  static_assert(obj1.A(0) == 1);
+  static_assert(obj1.A.coeff(0, 1) == 2);
+  constexpr ConstexprTest<double, 3> obj2(Matrix3d({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}));
+  VERIFY_IS_EQUAL(obj2.A.size(), 9);
+  static_assert(obj2.A(0, 0) == 1);
+  static_assert(obj2.A(0) == 1);
+  static_assert(obj2.A.coeff(0, 1) == 2);
+
   // Also check dynamic size arrays/matrices with fixed-size storage (currently
   // only works if all elements are initialized, since otherwise the compiler
   // complains about uninitialized trailing elements).
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index cdbaad6..9c5d6cf 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -1635,7 +1635,7 @@
   EIGEN_ALIGN_MAX Scalar data1[PacketSize];
   RealScalar refvalue = RealScalar(0);
   for (int i = 0; i < PacketSize; ++i) {
-    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    data1[i] = internal::random<Scalar>();
   }
 
   int stride = internal::random<int>(1, 20);
@@ -1655,7 +1655,7 @@
   }
 
   for (int i = 0; i < PacketSize * 7; ++i) {
-    buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    buffer[i] = internal::random<Scalar>();
   }
   packet = internal::pgather<Scalar, Packet>(buffer, 7);
   internal::pstore(data1, packet);
@@ -1745,6 +1745,7 @@
     CALL_SUBTEST_12(test::runner<std::complex<double>>::run());
     CALL_SUBTEST_13(test::runner<half>::run());
     CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
+    CALL_SUBTEST_14((packetmath_scatter_gather<bool, internal::packet_traits<bool>::type>()));
     CALL_SUBTEST_15(test::runner<bfloat16>::run());
     g_first_pass = false;
   }
diff --git a/test/threads_fork_join.cpp b/test/threads_fork_join.cpp
index 941c317..b852b05 100644
--- a/test/threads_fork_join.cpp
+++ b/test/threads_fork_join.cpp
@@ -12,39 +12,26 @@
 #include "Eigen/ThreadPool"
 
 struct TestData {
-  ThreadPool tp;
+  std::unique_ptr<ThreadPool> tp;
   std::vector<double> data;
 };
 
 TestData make_test_data(int num_threads, int num_shards) {
-  return {ThreadPool(num_threads), std::vector<double>(num_shards, 1.0)};
+  return {std::make_unique<ThreadPool>(num_threads), std::vector<double>(num_shards, 1.0)};
 }
 
-static void test_unary_parallel_for(int granularity) {
+static void test_parallel_for(int granularity) {
   // Test correctness.
   const int kNumTasks = 100000;
   TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
-  std::atomic<double> sum = 0.0;
-  std::function<void(int)> unary_do_fn = [&](int i) {
-    for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
-    };
-  };
-  ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(unary_do_fn), &test_data.tp);
-  VERIFY_IS_EQUAL(sum, kNumTasks);
-}
-
-static void test_binary_parallel_for(int granularity) {
-  // Test correctness.
-  const int kNumTasks = 100000;
-  TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
-  std::atomic<double> sum = 0.0;
-  std::function<void(int, int)> binary_do_fn = [&](int i, int j) {
+  std::atomic<uint64_t> sum(0);
+  std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
     for (int k = i; k < j; ++k)
-      for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
+      for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
       };
   };
-  ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(binary_do_fn), &test_data.tp);
-  VERIFY_IS_EQUAL(sum, kNumTasks);
+  ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(binary_do_fn), test_data.tp.get());
+  VERIFY_IS_EQUAL(sum.load(), kNumTasks);
 }
 
 static void test_async_parallel_for() {
@@ -54,26 +41,26 @@
   const int kNumTasks = 100;
   const int kNumAsyncCalls = kNumThreads * 4;
   TestData test_data = make_test_data(kNumThreads, kNumTasks);
-  std::atomic<double> sum = 0.0;
-  std::function<void(int)> unary_do_fn = [&](int i) {
-    for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
-    };
+  std::atomic<uint64_t> sum(0);
+  std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
+    for (Index k = i; k < j; ++k) {
+      for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
+      }
+    }
   };
-  Barrier barrier(kNumTasks * kNumAsyncCalls);
+  Barrier barrier(kNumAsyncCalls);
   std::function<void()> done = [&]() { barrier.Notify(); };
   for (int k = 0; k < kNumAsyncCalls; ++k) {
-    test_data.tp.Schedule([&]() {
-      ForkJoinScheduler::ParallelForAsync(0, kNumTasks, /*granularity=*/1, unary_do_fn, done, &test_data.tp);
+    test_data.tp->Schedule([&]() {
+      ForkJoinScheduler::ParallelForAsync(0, kNumTasks, /*granularity=*/1, binary_do_fn, done, test_data.tp.get());
     });
   }
   barrier.Wait();
-  VERIFY_IS_EQUAL(sum, kNumTasks * kNumAsyncCalls);
+  VERIFY_IS_EQUAL(sum.load(), kNumTasks * kNumAsyncCalls);
 }
 
 EIGEN_DECLARE_TEST(fork_join) {
-  CALL_SUBTEST(test_unary_parallel_for(1));
-  CALL_SUBTEST(test_unary_parallel_for(2));
-  CALL_SUBTEST(test_binary_parallel_for(1));
-  CALL_SUBTEST(test_binary_parallel_for(2));
+  CALL_SUBTEST(test_parallel_for(1));
+  CALL_SUBTEST(test_parallel_for(2));
   CALL_SUBTEST(test_async_parallel_for());
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index c9c613a..e9de988 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -50,6 +50,12 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool RandomToTypeUniform<bool>(uint64_t* state, uint64_t stream) {
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  return (rnd & 0x1) != 0;
+}
+
+template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
   // Generate 10 random bits for the mantissa, merge with exponent.
   unsigned rnd = PCG_XSH_RS_generator(state, stream);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index e12923d..98223fe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -247,6 +247,8 @@
 
   EIGEN_STRONG_INLINE TensorRef() : Base() {}
 
+  EIGEN_STRONG_INLINE TensorRef(const TensorRef& other) : Base(other) {}
+
   template <typename Expression>
   EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : Base(expr) {
     EIGEN_STATIC_ASSERT(internal::is_lvalue<Expression>::value,
@@ -254,6 +256,8 @@
                         "TensorRef<const Expression>?)");
   }
 
+  TensorRef& operator=(const TensorRef& other) { return Base::operator=(other).derived(); }
+
   template <typename Expression>
   EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) {
     EIGEN_STATIC_ASSERT(internal::is_lvalue<Expression>::value,
@@ -262,8 +266,6 @@
     return Base::operator=(expr).derived();
   }
 
-  TensorRef& operator=(const TensorRef& other) { return Base::operator=(other).derived(); }
-
   template <typename... IndexTypes>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) {
     const std::size_t num_indices = (sizeof...(otherIndices) + 1);
@@ -306,17 +308,17 @@
  public:
   EIGEN_STRONG_INLINE TensorRef() : Base() {}
 
+  EIGEN_STRONG_INLINE TensorRef(const TensorRef& other) : Base(other) {}
+
   template <typename Expression>
   EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : Base(expr) {}
 
+  TensorRef& operator=(const TensorRef& other) { return Base::operator=(other).derived(); }
+
   template <typename Expression>
   EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) {
     return Base::operator=(expr).derived();
   }
-
-  TensorRef(const TensorRef& other) : Base(other) {}
-
-  TensorRef& operator=(const TensorRef& other) { return Base::operator=(other).derived(); }
 };
 
 // evaluator for rvalues