Update Eigen to commit: d2dce377670f1742f3f701b3585682d02f10bb92

CHANGELOG
=========
d2dce3776 - Optimize slerp() as proposed by Gopinath Vasalamarri.
66d8111ac - Use a more conservative method to detect non-finite inputs to cbrt.
d6689a15d - Replace instances of EIGEN_CONSTEXPR macro
33f5f5961 - Vectorize cbrt for float and double.
533096090 - Enable packet segment in partial redux
6266d430c - packet segment: also check DiagonalWrapper
e39ad8bad - fix constexpr in CoreEvaluators.h
7aefb9f4d - fix memset optimization for std::complex types
73ca849a6 - fix packetSegment for ArrayWrapper / MatrixWrapper
28c3b26d5 - masked load/store framework
cebe09110 - Fix a potential deadlock because of Eigen thread pool
11fd34cc1 - Fix the typing of the Tasks in ForkJoin.h
2cd47d743 - Fix Conversion Warning in Parallelizer
b86004226 - Add postream for ostream-ing packets more reliably.
02d9e1138 - Add missing pmadd for Packet16bf.
9cc9209b9 - Fix cmake warning and default to j0.
e0c99a8dd - By default, run ctests on all available cores in parallel.
63a40ffb9 - Use fma<float> for fma<half> and fma<bfloat16> if native fma is not available on the platform.
44fb6422b - Allow triggering full CI if MR label contains all-tests
3866cbfbe - Fix test for TensorRef of trace.
6579e36eb - Allow Tensor trace to be passed to a TensorRef.
8e32cbf7d - Reduce flakiness of test for Eigen::half.
d935916ac - Add numext::fma and missing pmadd implementations.
754bd24f5 - fix 2828
ac2165c11 - fix allFinite
314396819 - Generalize the Eigen ForkJoin scheduler to use any ThreadPool interface.
70f2aead9 - Use native _Float16 for AVX512FP16 and update vectorization.
0259a52b0 - Use more .noalias()
14f845a1a - Fix givens rotation.
33b04fe51 - CMake: add install-doc target
10e62ccd2 - Fix x86 complex vectorized fma

PiperOrigin-RevId: 753703769
Change-Id: I43bc1cf7c598ca3f306fffea9844e9c5b1a21b79
diff --git a/Eigen/Core b/Eigen/Core
index 99cd473..6ae069a 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -193,21 +193,27 @@
 #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX512/PacketMath.h"
 #if defined EIGEN_VECTORIZE_AVX512FP16
 #include "src/Core/arch/AVX512/PacketMathFP16.h"
 #endif
-#include "src/Core/arch/SSE/PacketMath.h"
 #include "src/Core/arch/SSE/TypeCasting.h"
-#include "src/Core/arch/SSE/Complex.h"
-#include "src/Core/arch/AVX/PacketMath.h"
 #include "src/Core/arch/AVX/TypeCasting.h"
-#include "src/Core/arch/AVX/Complex.h"
-#include "src/Core/arch/AVX512/PacketMath.h"
 #include "src/Core/arch/AVX512/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/TypeCastingFP16.h"
+#endif
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/Complex.h"
 #include "src/Core/arch/AVX512/Complex.h"
 #include "src/Core/arch/SSE/MathFunctions.h"
 #include "src/Core/arch/AVX/MathFunctions.h"
 #include "src/Core/arch/AVX512/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/MathFunctionsFP16.h"
+#endif
 #include "src/Core/arch/AVX512/TrsmKernel.h"
 #elif defined EIGEN_VECTORIZE_AVX
    // Use AVX for floats and doubles, SSE for integers
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 5d52ab2..b1d801d 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -230,8 +230,8 @@
    */
   const LDLT& adjoint() const { return *this; }
 
-  EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 01b4476..7fa4fa2 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -182,10 +182,10 @@
    * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
    * \code x = decomposition.adjoint().solve(b) \endcode
    */
-  const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }
+  const LLT& adjoint() const noexcept { return *this; }
 
-  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   template <typename VectorType>
   LLT& rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 3376dfc..57f3186 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -129,7 +129,7 @@
 #endif
   /** \brief Move constructor */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(Array&&) = default;
-  EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+  EIGEN_DEVICE_FUNC Array& operator=(Array&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
     Base::operator=(std::move(other));
     return *this;
   }
@@ -253,8 +253,8 @@
           PrivateType())
       : Base(other.derived()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
 
 #ifdef EIGEN_ARRAY_PLUGIN
 #include EIGEN_ARRAY_PLUGIN
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index b636d88..c9a194e 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -56,14 +56,10 @@
 
   EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
-    return m_expression.outerStride();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_expression.innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
 
   EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
   EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
@@ -135,14 +131,10 @@
 
   EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
-    return m_expression.outerStride();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_expression.innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
 
   EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
   EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 0ea1bc3..36f0a9d 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -136,6 +136,7 @@
       : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
 #endif
                                               : NoUnrolling;
+  static constexpr bool UsePacketSegment = has_packet_segment<PacketType>::value;
 
 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug() {
@@ -199,7 +200,7 @@
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
 };
 
 template <typename Kernel, int Index_, int Stop>
@@ -253,7 +254,7 @@
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
 };
 
 template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
@@ -273,6 +274,33 @@
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
 };
 
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment>
+struct copy_using_evaluator_innervec_segment {
+  using PacketType = typename Kernel::PacketType;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+    kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0,
+                                                                                            Stop - Start);
+  }
+};
+
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ false>
+    : copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
 /***************************************************************************
  * Part 3 : implementation of all cases
  ***************************************************************************/
@@ -306,7 +334,7 @@
 struct dense_assignment_loop_impl<Kernel, AllAtOnceTraversal, Unrolling> {
   static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
 
-  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) {
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& /*kernel*/) {
     EIGEN_STATIC_ASSERT(SizeAtCompileTime == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
   }
 };
@@ -353,28 +381,46 @@
 // The goal of unaligned_dense_assignment_loop is simply to factorize the handling
 // of the non vectorizable beginning and ending parts
 
-template <bool IsAligned = false>
+template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip>
 struct unaligned_dense_assignment_loop {
-  // if IsAligned = true, then do nothing
+  // if Skip == true, then do nothing
   template <typename Kernel>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*start*/, Index /*end*/) {}
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*outer*/,
+                                                                  Index /*innerStart*/, Index /*innerEnd*/) {}
 };
 
-template <>
-struct unaligned_dense_assignment_loop<false> {
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-  // FIXME check which version exhibits this issue
-#if EIGEN_COMP_MSVC
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true,
+                                       /*Skip*/ false> {
   template <typename Kernel>
-  static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end)
-#else
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {
+    Index count = end - start;
+    eigen_assert(count <= unpacket_traits<PacketType>::size);
+    if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count);
+  }
   template <typename Kernel>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end)
-#endif
-  {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index start, Index end) {
+    Index count = end - start;
+    eigen_assert(count <= unpacket_traits<PacketType>::size);
+    if (count > 0)
+      kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count);
+  }
+};
+
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false,
+                                       /*Skip*/ false> {
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {
     for (Index index = start; index < end; ++index) kernel.assignCoeff(index);
   }
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index innerStart,
+                                                                  Index innerEnd) {
+    for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+  }
 };
 
 template <typename Kernel, int Index_, int Stop>
@@ -392,7 +438,32 @@
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Index_, int Stop, bool UsePacketSegment>
+struct copy_using_evaluator_linearvec_segment {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_);
+  }
+};
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false>
+    : copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
 };
 
 template <typename Kernel>
@@ -400,23 +471,30 @@
   using Scalar = typename Kernel::Scalar;
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
-  static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment;
-  static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
   static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
-  static constexpr int DstAlignment =
-      packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
+  static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
+  static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
+  static constexpr bool Alignable =
+      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+  static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
+  static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  using head_loop =
+      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     const Index size = kernel.size();
-    const Index alignedStart = DstIsAligned ? 0 : first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
+    const Index alignedStart = DstIsAligned ? 0 : first_aligned<Alignment>(kernel.dstDataPtr(), size);
     const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
 
-    unaligned_dense_assignment_loop<DstIsAligned>::run(kernel, 0, alignedStart);
+    head_loop::run(kernel, 0, alignedStart);
 
     for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
-      kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
+      kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index);
 
-    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+    tail_loop::run(kernel, alignedEnd, size);
   }
 };
 
@@ -426,10 +504,11 @@
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
   static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
   static constexpr int AlignedSize = numext::round_down(Size, PacketSize);
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);
-    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, AlignedSize, Size>::run(kernel);
+    copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel);
   }
 };
 
@@ -444,7 +523,7 @@
   static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
   static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     const Index innerSize = kernel.innerSize();
     const Index outerSize = kernel.outerSize();
     for (Index outer = 0; outer < outerSize; ++outer)
@@ -482,7 +561,7 @@
 
 template <typename Kernel>
 struct dense_assignment_loop_impl<Kernel, LinearTraversal, NoUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     const Index size = kernel.size();
     for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
   }
@@ -490,7 +569,7 @@
 
 template <typename Kernel>
 struct dense_assignment_loop_impl<Kernel, LinearTraversal, CompleteUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run(
         kernel);
   }
@@ -505,35 +584,35 @@
   using Scalar = typename Kernel::Scalar;
   using PacketType = typename Kernel::PacketType;
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
-  static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
+  static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
   static constexpr bool Alignable =
-      packet_traits<Scalar>::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar);
-  static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
-  static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
+      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+  static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
+  static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     const Scalar* dst_ptr = kernel.dstDataPtr();
-    if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) {
-      // the pointer is not aligned-on scalar, so alignment is not possible
-      return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel);
-    }
     const Index innerSize = kernel.innerSize();
     const Index outerSize = kernel.outerSize();
     const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;
-    Index alignedStart =
-        ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<RequestedAlignment>(dst_ptr, innerSize);
+    Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<Alignment>(dst_ptr, innerSize);
 
     for (Index outer = 0; outer < outerSize; ++outer) {
       const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);
-      // do the non-vectorizable part of the assignment
-      for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+
+      head_loop::run(kernel, outer, 0, alignedStart);
 
       // do the vectorizable part of the assignment
       for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
-        kernel.template assignPacketByOuterInner<DstAlignment, Unaligned, PacketType>(outer, inner);
+        kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner);
 
-      // do the non-vectorizable part of the assignment
-      for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+      tail_loop::run(kernel, outer, alignedEnd, innerSize);
 
       alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);
     }
@@ -547,11 +626,16 @@
   static constexpr int PacketSize = unpacket_traits<PacketType>::size;
   static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
   static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+  using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>;
+  using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned,
+                                                                    Unaligned, UsePacketSegment>;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, 0, 0>::run(kernel, outer);
-      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizableSize, InnerSize>::run(kernel, outer);
+      packet_loop::run(kernel, outer);
+      packet_segment_loop::run(kernel, outer);
     }
   }
 };
@@ -590,15 +674,15 @@
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerSize() const noexcept { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerSize() const noexcept { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_dstExpr.outerStride(); }
 
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() noexcept { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; }
 
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) {
@@ -635,6 +719,27 @@
     assignPacket<StoreMode, LoadMode, Packet>(row, col);
   }
 
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+    m_functor.template assignPacketSegment<StoreMode>(
+        &m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin,
+        count);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+    m_functor.template assignPacketSegment<StoreMode>(
+        &m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin,
+                                                                             Index count) {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count);
+  }
+
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::RowsAtCompileTime) == 1          ? 0
@@ -704,9 +809,8 @@
 }
 
 template <typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
-                                                                                      const SrcXprType& src,
-                                                                                      const Functor& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                                const Functor& func) {
   typedef evaluator<DstXprType> DstEvaluatorType;
   typedef evaluator<SrcXprType> SrcEvaluatorType;
 
@@ -775,7 +879,7 @@
 
 // Deal with "assume-aliasing"
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
     Dst& dst, const Src& src, const Func& func, std::enable_if_t<evaluator_assume_aliasing<Src>::value, void*> = 0) {
   typename plain_matrix_type<Src>::type tmp(src);
   call_assignment_no_alias(dst, tmp, func);
@@ -790,14 +894,14 @@
 // by-pass "assume-aliasing"
 // When there is no aliasing, we require that 'dst' has been properly resized
 template <typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(NoAlias<Dst, StorageBase>& dst,
-                                                                           const Src& src, const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(NoAlias<Dst, StorageBase>& dst, const Src& src,
+                                                                     const Func& func) {
   call_assignment_no_alias(dst.expression(), src, func);
 }
 
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src,
-                                                                                    const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src,
+                                                                              const Func& func) {
   enum {
     NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
                        (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
@@ -836,14 +940,13 @@
 }
 
 template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src) {
   call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
 }
 
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst,
-                                                                                                 const Src& src,
-                                                                                                 const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src,
+                                                                                           const Func& func) {
   // TODO check whether this is the right place to perform these checks:
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src)
@@ -852,8 +955,7 @@
   Assignment<Dst, Src, Func>::run(dst, src, func);
 }
 template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst,
-                                                                                                 const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) {
   call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
 }
 
diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h
index ca991ca..57b0322 100644
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -200,16 +200,16 @@
       : m_coeffs(1 + supers + subs, cols), m_rows(rows), m_supers(supers), m_subs(subs) {}
 
   /** \returns the number of columns */
-  inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+  constexpr Index rows() const { return m_rows.value(); }
 
   /** \returns the number of rows */
-  inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+  constexpr Index cols() const { return m_coeffs.cols(); }
 
   /** \returns the number of super diagonals */
-  inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+  constexpr Index supers() const { return m_supers.value(); }
 
   /** \returns the number of sub diagonals */
-  inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+  constexpr Index subs() const { return m_subs.value(); }
 
   inline const CoefficientsType& coeffs() const { return m_coeffs; }
   inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -260,16 +260,16 @@
   }
 
   /** \returns the number of columns */
-  inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+  constexpr Index rows() const { return m_rows.value(); }
 
   /** \returns the number of rows */
-  inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+  constexpr Index cols() const { return m_coeffs.cols(); }
 
   /** \returns the number of super diagonals */
-  inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+  constexpr Index supers() const { return m_supers.value(); }
 
   /** \returns the number of sub diagonals */
-  inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+  constexpr Index subs() const { return m_subs.value(); }
 
   inline const CoefficientsType& coeffs() const { return m_coeffs; }
 
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 709264c..39abff7 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -289,13 +289,9 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT {
-    return m_startRow.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT {
-    return m_startCol.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
 
  protected:
   XprTypeNested m_xpr;
@@ -319,8 +315,7 @@
    * Adding an offset to nullptr is undefined behavior, so we must avoid it.
    */
   template <typename Scalar>
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base,
-                                                                                               Index offset) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) {
     return base != nullptr ? base + offset : nullptr;
   }
 
@@ -378,30 +373,25 @@
     init();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const
-      EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const noexcept {
     return m_xpr;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
 
   /** \sa MapBase::innerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index innerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index innerStride() const noexcept {
     return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.innerStride() : m_xpr.outerStride();
   }
 
   /** \sa MapBase::outerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept {
     return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT {
-    return m_startRow.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT {
-    return m_startCol.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
 
 #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index c629123..c414117 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -92,7 +92,7 @@
 
   EIGEN_DEVICE_FUNC inline ~CommaInitializer()
 #if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
-      EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+      noexcept(false)  // Eigen::eigen_assert_exception
 #endif
   {
     finished();
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 156ca2b..e3af2d2 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -149,7 +149,7 @@
 #endif
     eigen_internal_assert(outerStride == OuterStride);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return OuterStride; }
   const Scalar* data;
 };
 
@@ -198,19 +198,13 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
-    if (IsRowMajor)
-      return m_d.data[row * m_d.outerStride() + col];
-    else
-      return m_d.data[row + col * m_d.outerStride()];
+    return coeff(getIndex(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
-    if (IsRowMajor)
-      return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
-    else
-      return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
+    return coeffRef(getIndex(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
@@ -219,10 +213,7 @@
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    if (IsRowMajor)
-      return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
-    else
-      return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
+    return packet<LoadMode, PacketType>(getIndex(row, col));
   }
 
   template <int LoadMode, typename PacketType>
@@ -232,19 +223,43 @@
 
   template <int StoreMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
-    if (IsRowMajor)
-      return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
-    else
-      return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
+    writePacket<StoreMode, PacketType>(getIndex(row, col), x);
   }
 
   template <int StoreMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+    pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count);
   }
 
  protected:
   plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index constexpr getIndex(Index row, Index col) const {
+    return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
+  }
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -318,6 +333,28 @@
     m_argImpl.template writePacket<StoreMode, PacketType>(index, x);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count);
+  }
+
  protected:
   evaluator<ArgType> m_argImpl;
 };
@@ -464,10 +501,10 @@
 struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
     : evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
   typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
-  typedef internal::remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
+  typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
 
   enum {
-    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+    CoeffReadCost = functor_traits<NullaryOp>::Cost,
 
     Flags = (evaluator<PlainObjectTypeCleaned>::Flags &
              (HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0) |
@@ -502,9 +539,21 @@
     return m_wrapper.template packetOp<PacketType>(m_functor, index);
   }
 
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/,
+                                                                 Index /*count*/) const {
+    return packet<LoadMode, PacketType, IndexType>(row, col);
+  }
+
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/,
+                                                                 Index /*count*/) const {
+    return packet<LoadMode, PacketType, IndexType>(index);
+  }
+
  protected:
   const NullaryOp m_functor;
-  const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
+  const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
 };
 
 // -------------------- CwiseUnaryOp --------------------
@@ -546,6 +595,16 @@
     return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
@@ -600,16 +659,11 @@
   template <typename DstPacketType>
   using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>;
 
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) const {
-    return col + packetSize <= cols();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const {
+    return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows());
   }
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const {
-    return row + packetSize <= rows();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const {
-    return index + packetSize <= size();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const {
+    return index + count + begin <= size();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const {
@@ -632,43 +686,86 @@
   template <int LoadMode, typename PacketType = SrcPacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
     constexpr int PacketSize = unpacket_traits<PacketType>::size;
-    Index actualRow = IsRowMajor ? row : row + (offset * PacketSize);
-    Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col;
-    eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds");
+    Index packetOffset = offset * PacketSize;
+    Index actualRow = IsRowMajor ? row : row + packetOffset;
+    Index actualCol = IsRowMajor ? col + packetOffset : col;
+    eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds");
     return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol);
   }
   template <int LoadMode, typename PacketType = SrcPacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
     constexpr int PacketSize = unpacket_traits<PacketType>::size;
-    Index actualIndex = index + (offset * PacketSize);
-    eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds");
+    Index packetOffset = offset * PacketSize;
+    Index actualIndex = index + packetOffset;
+    eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds");
     return m_argImpl.template packet<LoadMode, PacketType>(actualIndex);
   }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count,
+                                                                    Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualRow = IsRowMajor ? row : row + packetOffset;
+    Index actualCol = IsRowMajor ? col + packetOffset : col;
+    eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds");
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
+  }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count,
+                                                                    Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualIndex = index + packetOffset + begin;
+    eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds");
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
+  }
+
+  template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col,
+                                                                                                   Index begin,
+                                                                                                   Index count) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<PacketType, NumPackets> packets;
+    Index offset = begin / SrcPacketSize;
+    Index actualBegin = begin % SrcPacketSize;
+    for (; offset < NumPackets; offset++) {
+      Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+      packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset);
+      if (count == actualCount) break;
+      actualBegin = 0;
+      count -= actualCount;
+    }
+    return packets;
+  }
+  template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index,
+                                                                                                   Index begin,
+                                                                                                   Index count) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<PacketType, NumPackets> packets;
+    Index offset = begin / SrcPacketSize;
+    Index actualBegin = begin % SrcPacketSize;
+    for (; offset < NumPackets; offset++) {
+      Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+      packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset);
+      if (count == actualCount) break;
+      actualBegin = 0;
+      count -= actualCount;
+    }
+    return packets;
+  }
 
   // There is no source packet type with equal or fewer elements than DstPacketType.
   // This is problematic as the evaluation loop may attempt to access data outside the bounds of the array.
   // For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}.
   // The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
   // is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array.
-
-  // Instead, perform runtime check to determine if the load would access data outside the bounds of the array.
-  // If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load.
-  // In either case, perform a vectorized cast of the source packet.
   template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
     constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
     constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
     constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
-    SrcPacketType src;
-    if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) {
-      src = srcPacket<SrcLoadMode>(row, col, 0);
-    } else {
-      Array<SrcType, SrcPacketSize, 1> srcArray;
-      for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k);
-      for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
-      src = pload<SrcPacketType>(srcArray.data());
-    }
-    return pcast<SrcPacketType, DstPacketType>(src);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0));
   }
   // Use the source packet type with the same size as DstPacketType, if it exists
   template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
@@ -704,22 +801,67 @@
         srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7));
   }
 
+  // packetSegment variants
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(
+        srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 2;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 4;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3]);
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 8;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3], packets.packet[4], packets.packet[5],
+                                               packets.packet[6], packets.packet[7]);
+  }
+
   // Analogous routines for linear access.
   template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
     constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
     constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
     constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
-    SrcPacketType src;
-    if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) {
-      src = srcPacket<SrcLoadMode>(index, 0);
-    } else {
-      Array<SrcType, SrcPacketSize, 1> srcArray;
-      for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k);
-      for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
-      src = pload<SrcPacketType>(srcArray.data());
-    }
-    return pcast<SrcPacketType, DstPacketType>(src);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0));
   }
   template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
@@ -749,6 +891,55 @@
                                                srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7));
   }
 
+  // packetSegment variants
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(
+        srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 2;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 4;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3]);
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 8;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3], packets.packet[4], packets.packet[5],
+                                               packets.packet[6], packets.packet[7]);
+  }
+
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; }
@@ -826,6 +1017,20 @@
                                m_d.arg3Impl.template packet<LoadMode, PacketType>(index));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
@@ -922,6 +1127,18 @@
                                m_d.rhsImpl.template packet<LoadMode, PacketType>(index));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
@@ -1013,7 +1230,7 @@
         m_innerStride(map.innerStride()),
         m_outerStride(map.outerStride()) {
     EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0,
-                                          internal::inner_stride_at_compile_time<Derived>::ret == 1),
+                                          inner_stride_at_compile_time<Derived>::ret == 1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -1035,36 +1252,60 @@
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     PointerType ptr = m_data + row * rowStride() + col * colStride();
-    return internal::ploadt<PacketType, LoadMode>(ptr);
+    return ploadt<PacketType, LoadMode>(ptr);
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+    return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
 
   template <int StoreMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
     PointerType ptr = m_data + row * rowStride() + col * colStride();
-    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+    pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
 
   template <int StoreMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+    pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return ploadtSegment<PacketType, LoadMode>(ptr, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x, begin, count);
   }
 
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowStride() const noexcept {
     return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colStride() const noexcept {
     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
   }
 
   PointerType m_data;
-  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
-  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+  const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
 template <typename PlainObjectType, int MapOptions, typename StrideType>
@@ -1117,7 +1358,7 @@
 // -------------------- Block --------------------
 
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
-          bool HasDirectAccess = internal::has_direct_access<ArgType>::ret>
+          bool HasDirectAccess = has_direct_access<ArgType>::ret>
 struct block_evaluator;
 
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -1246,6 +1487,39 @@
                                                 x);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col,
+                                                                  begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
+    else
+      return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
+                                                 begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row,
+                                                                        m_startCol.value() + col, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
+                                                                          count);
+    else
+      return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                                       RowsAtCompileTime == 1 ? index : 0, x, begin, count);
+  }
+
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
   linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
@@ -1341,8 +1615,8 @@
   typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
-  typedef typename internal::nested_eval<ArgType, Factor>::type ArgTypeNested;
-  typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+  typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
+  typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
 
   enum {
     CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
@@ -1361,19 +1635,15 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     // try to avoid using modulo; this is a pure optimization strategy
-    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
-                             : RowFactor == 1                                  ? row
-                                                                               : row % m_rows.value();
-    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
-                             : ColFactor == 1                                  ? col
-                                                                               : col % m_cols.value();
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
 
     return m_argImpl.coeff(actual_row, actual_col);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     // try to avoid using modulo; this is a pure optimization strategy
-    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
                                    ? (ColFactor == 1 ? index : index % m_cols.value())
                                    : (RowFactor == 1 ? index : index % m_rows.value());
 
@@ -1382,25 +1652,38 @@
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
-                             : RowFactor == 1                                  ? row
-                                                                               : row % m_rows.value();
-    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
-                             : ColFactor == 1                                  ? col
-                                                                               : col % m_cols.value();
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
 
     return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col);
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
                                    ? (ColFactor == 1 ? index : index % m_cols.value())
                                    : (RowFactor == 1 ? index : index % m_rows.value());
 
     return m_argImpl.template packet<LoadMode, PacketType>(actual_index);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
+
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
+                                   ? (ColFactor == 1 ? index : index % m_cols.value())
+                                   : (RowFactor == 1 ? index : index % m_rows.value());
+
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count);
+  }
+
  protected:
   const ArgTypeNested m_arg;
   evaluator<ArgTypeNestedCleaned> m_argImpl;
@@ -1457,6 +1740,28 @@
     m_argImpl.template writePacket<StoreMode>(index, x);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count);
+  }
+
  protected:
   evaluator<ArgType> m_argImpl;
 };
@@ -1536,41 +1841,97 @@
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    enum {
-      PacketSize = unpacket_traits<PacketType>::size,
-      OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
-      OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
-    };
-    typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
-    return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(
-        ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? m_cols.value() - col - OffsetCol : col));
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+    return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol));
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    enum { PacketSize = unpacket_traits<PacketType>::size };
-    return preverse(
-        m_argImpl.template packet<LoadMode, PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+    return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex));
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
-    // FIXME we could factorize some code with packet(i,j)
-    enum {
-      PacketSize = unpacket_traits<PacketType>::size,
-      OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
-      OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
-    };
-    typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
-    m_argImpl.template writePacket<LoadMode>(ReverseRow ? m_rows.value() - row - OffsetRow : row,
-                                             ReverseCol ? m_cols.value() - col - OffsetCol : col,
-                                             reverse_packet::run(x));
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+    m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x));
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    enum { PacketSize = unpacket_traits<PacketType>::size };
-    m_argImpl.template writePacket<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+    m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+    Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+    return reverse_packet::run(
+        m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+    Index actualBegin = PacketSize - count - begin;
+
+    return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+    Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+    m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+    Index actualBegin = PacketSize - count - begin;
+
+    m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count);
   }
 
  protected:
@@ -1621,13 +1982,13 @@
 
  protected:
   evaluator<ArgType> m_argImpl;
-  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+  const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const {
     return m_index.value() > 0 ? 0 : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const {
     return m_index.value() > 0 ? m_index.value() : 0;
   }
 };
@@ -1656,9 +2017,9 @@
 
   const ArgType& arg() const { return m_arg; }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); }
+  constexpr Index rows() const noexcept { return m_arg.rows(); }
 
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); }
+  constexpr Index cols() const noexcept { return m_arg.cols(); }
 
  private:
   const ArgType& m_arg;
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index aa79b60..e2b2da5 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -108,12 +108,12 @@
     eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept {
     // return the fixed size type if available to enable compile time optimizations
     return internal::traits<internal::remove_all_t<LhsNested>>::RowsAtCompileTime == Dynamic ? m_rhs.rows()
                                                                                              : m_lhs.rows();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept {
     // return the fixed size type if available to enable compile time optimizations
     return internal::traits<internal::remove_all_t<LhsNested>>::ColsAtCompileTime == Dynamic ? m_rhs.cols()
                                                                                              : m_lhs.cols();
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 9c305c6..13a542a 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -76,8 +76,8 @@
     EIGEN_STATIC_ASSERT(CwiseNullaryOp::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const { return m_cols.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols.value(); }
 
   /** \returns the functor representing the nullary operation */
   EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 42ed459..94ec1a0 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -60,8 +60,8 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_xpr.cols(); }
 
   /** \returns the functor representing the unary operation */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 49b1410..7dd7623 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -66,13 +66,13 @@
 
   EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeffRef(0)); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? int(StrideType::InnerStrideAtCompileTime)
                                                      : derived().nestedExpression().innerStride() *
                                                            sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? int(StrideType::OuterStrideAtCompileTime)
                                                      : derived().nestedExpression().outerStride() *
                                                            sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
@@ -145,8 +145,8 @@
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \returns the functor representing unary operation */
   EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index d5906bd..4f68942 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -208,7 +208,7 @@
    * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
    * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
    * column-major matrix, and the number of rows for a row-major matrix. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerSize() const {
     return IsVectorAtCompileTime ? 1 : int(IsRowMajor) ? this->rows() : this->cols();
   }
 
@@ -217,7 +217,7 @@
    * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
    * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
    * column-major matrix, and the number of columns for a row-major matrix. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerSize() const {
     return IsVectorAtCompileTime ? this->size() : int(IsRowMajor) ? this->cols() : this->rows();
   }
 
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 97f9b50..cff104c 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -89,13 +89,12 @@
    *
    * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
     eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return internal::evaluator<Derived>(derived()).coeff(row, col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeffByOuterInner(Index outer,
-                                                                                          Index inner) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
     return coeff(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
   }
 
@@ -103,7 +102,7 @@
    *
    * \sa operator()(Index,Index), operator[](Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator()(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index row, Index col) const {
     eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return coeff(row, col);
   }
@@ -123,7 +122,7 @@
    * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const {
     EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
                         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
     eigen_internal_assert(index >= 0 && index < size());
@@ -138,7 +137,7 @@
    * z() const, w() const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator[](Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator[](Index index) const {
     EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                         THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
     eigen_assert(index >= 0 && index < size());
@@ -155,32 +154,32 @@
    * z() const, w() const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator()(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index index) const {
     eigen_assert(index >= 0 && index < size());
     return coeff(index);
   }
 
   /** equivalent to operator[](0).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType x() const { return (*this)[0]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType x() const { return (*this)[0]; }
 
   /** equivalent to operator[](1).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType y() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType y() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
     return (*this)[1];
   }
 
   /** equivalent to operator[](2).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType z() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType z() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
     return (*this)[2];
   }
 
   /** equivalent to operator[](3).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType w() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType w() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
     return (*this)[3];
   }
@@ -362,32 +361,32 @@
    * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& operator()(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index index) {
     eigen_assert(index >= 0 && index < size());
     return coeffRef(index);
   }
 
   /** equivalent to operator[](0).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& x() { return (*this)[0]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& x() { return (*this)[0]; }
 
   /** equivalent to operator[](1).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& y() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& y() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
     return (*this)[1];
   }
 
   /** equivalent to operator[](2).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& z() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& z() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
     return (*this)[2];
   }
 
   /** equivalent to operator[](3).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& w() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& w() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
     return (*this)[3];
   }
@@ -421,33 +420,29 @@
    *
    * \sa outerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return derived().innerStride(); }
 
   /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
    *          in a column-major matrix).
    *
    * \sa innerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return derived().outerStride(); }
 
   // FIXME shall we remove it ?
-  EIGEN_CONSTEXPR inline Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
+  constexpr Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
 
   /** \returns the pointer increment between two consecutive rows.
    *
    * \sa innerStride(), outerStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const {
-    return Derived::IsRowMajor ? outerStride() : innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const { return Derived::IsRowMajor ? outerStride() : innerStride(); }
 
   /** \returns the pointer increment between two consecutive columns.
    *
    * \sa innerStride(), outerStride(), rowStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const {
-    return Derived::IsRowMajor ? innerStride() : outerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const { return Derived::IsRowMajor ? innerStride() : outerStride(); }
 };
 
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
@@ -478,25 +473,23 @@
    *
    * \sa outerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
 
   /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
    *          in a column-major matrix).
    *
    * \sa innerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
 
   // FIXME shall we remove it ?
-  EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT {
-    return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
-  }
+  constexpr Index stride() const noexcept { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
 
   /** \returns the pointer increment between two consecutive rows.
    *
    * \sa innerStride(), outerStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const noexcept {
     return Derived::IsRowMajor ? outerStride() : innerStride();
   }
 
@@ -504,7 +497,7 @@
    *
    * \sa innerStride(), outerStride(), rowStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const noexcept {
     return Derived::IsRowMajor ? innerStride() : outerStride();
   }
 };
@@ -513,7 +506,7 @@
 
 template <int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl {
-  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; }
+  static constexpr Index run(const Derived&) noexcept { return 0; }
 };
 
 template <int Alignment, typename Derived>
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
index 75fc8e7..012dce1 100644
--- a/Eigen/src/Core/DeviceWrapper.h
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -87,13 +87,13 @@
           int Unrolling = Kernel::AssignmentTraits::Unrolling>
 struct dense_assignment_loop_with_device {
   using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Device&) { Base::run(kernel); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); }
 };
 
 // entry point for a generic expression with device
 template <typename Dst, typename Src, typename Func, typename Device>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
-                                                                                    const Src& src, const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
+                                                                              const Src& src, const Func& func) {
   enum {
     NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
                        (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
@@ -115,10 +115,8 @@
 
 // copy and pasted from AssignEvaluator except forward device to kernel
 template <typename DstXprType, typename SrcXprType, typename Functor, typename Device>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
-                                                                                      const SrcXprType& src,
-                                                                                      const Functor& func,
-                                                                                      Device& device) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                                const Functor& func, Device& device) {
   using DstEvaluatorType = evaluator<DstXprType>;
   using SrcEvaluatorType = evaluator<SrcXprType>;
 
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 8d27857..ff8611c 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -83,13 +83,11 @@
                                : numext::mini<Index>(m_matrix.rows(), m_matrix.cols() - m_index.value());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return 1; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_matrix.outerStride() + 1;
-  }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.outerStride() + 1; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 0; }
 
   typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
@@ -134,13 +132,13 @@
 
  private:
   // some compilers may fail to optimize std::max etc in case of compile-time constants...
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index absDiagIndex() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index absDiagIndex() const noexcept {
     return m_index.value() > 0 ? m_index.value() : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const noexcept {
     return m_index.value() > 0 ? 0 : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const noexcept {
     return m_index.value() > 0 ? m_index.value() : 0;
   }
   // trigger a compile-time error if someone try to call packet
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 4115b64..52630d9 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -76,9 +76,9 @@
   }
 
   /** \returns the number of rows. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return diagonal().size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return diagonal().size(); }
   /** \returns the number of columns. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return diagonal().size(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return diagonal().size(); }
 
   /** \returns the diagonal matrix product of \c *this by the dense matrix, \a matrix */
   template <typename MatrixDerived>
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index 894bfc1..c9a6e88 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -56,12 +56,12 @@
   EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
    * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
   template <typename Dest>
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h
index 3b0af91..9d4ecd4 100644
--- a/Eigen/src/Core/Fill.h
+++ b/Eigen/src/Core/Fill.h
@@ -92,7 +92,8 @@
 
 template <typename Xpr>
 struct eigen_memset_helper {
-  static constexpr bool value = std::is_trivial<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
+  static constexpr bool value =
+      std::is_trivially_copyable<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
 };
 
 template <typename Xpr>
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index a91b0da..55beab3 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,14 +41,10 @@
 
   EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
-    return m_expression.outerStride();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_expression.innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
 
   EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const {
     return m_expression.coeff(row, col);
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 26a4634..d45cb4b 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -72,6 +72,7 @@
     HasReciprocal = 0,
     HasSqrt = 0,
     HasRsqrt = 0,
+    HasCbrt = 0,
     HasExp = 0,
     HasExpm1 = 0,
     HasLog = 0,
@@ -368,6 +369,11 @@
 EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) {
   return a / b;
 }
+// Avoid compiler warning for boolean algebra.
+template <>
+EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) {
+  return a && b;
+}
 
 // In the generic case, memset to all one bits.
 template <typename Packet, typename EnableIf = void>
@@ -449,48 +455,42 @@
 
 template <typename T>
 struct bit_and {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
 };
 
 template <typename T>
 struct bit_or {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
 };
 
 template <typename T>
 struct bit_xor {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
 };
 
 template <typename T>
 struct bit_not {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
 };
 
 template <>
 struct bit_and<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
-    return a && b;
-  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a && b; }
 };
 
 template <>
 struct bit_or<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
-    return a || b;
-  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a || b; }
 };
 
 template <>
 struct bit_xor<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
-    return a != b;
-  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a != b; }
 };
 
 template <>
 struct bit_not<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
 };
 
 // Use operators &, |, ^, ~.
@@ -580,7 +580,7 @@
 }
 
 // In the general case, use bitwise select.
-template <typename Packet, typename EnableIf = void>
+template <typename Packet, bool is_scalar = is_scalar<Packet>::value>
 struct pselect_impl {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
     return por(pand(a, mask), pandnot(b, mask));
@@ -589,9 +589,9 @@
 
 // For scalars, use ternary select.
 template <typename Packet>
-struct pselect_impl<Packet, std::enable_if_t<is_scalar<Packet>::value>> {
+struct pselect_impl<Packet, true> {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
-    return numext::equal_strict(mask, Packet(0)) ? b : a;
+    return numext::select(mask, a, b);
   }
 };
 
@@ -1294,29 +1294,61 @@
  * The following functions might not have to be overwritten for vectorized types
  ***************************************************************************/
 
+template <typename Packet, typename EnableIf = void>
+struct pmadd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
+    return padd(pmul(a, b), c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
+    return psub(pmul(a, b), c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
+    return psub(c, pmul(a, b));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
+    return pnegate(pmadd(a, b, c));
+  }
+};
+
+template <typename Scalar>
+struct pmadd_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value && NumTraits<Scalar>::IsSigned>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::fma(a, b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::fma(a, b, Scalar(-c));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::fma(Scalar(-a), b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return -Scalar(numext::fma(a, b, c));
+  }
+};
+
 // FMA instructions.
 /** \internal \returns a * b + c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
-  return padd(pmul(a, b), c);
+  return pmadd_impl<Packet>::pmadd(a, b, c);
 }
 
 /** \internal \returns a * b - c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
-  return psub(pmul(a, b), c);
+  return pmadd_impl<Packet>::pmsub(a, b, c);
 }
 
 /** \internal \returns -(a * b) + c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
-  return psub(c, pmul(a, b));
+  return pmadd_impl<Packet>::pnmadd(a, b, c);
 }
 
 /** \internal \returns -(a * b + c) (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
-  return pnegate(pmadd(a, b, c));
+  return pmadd_impl<Packet>::pnmsub(a, b, c);
 }
 
 /** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned
@@ -1525,6 +1557,104 @@
   return (Packet)pand(result, peven_mask(result));  // atan2 0    atan2 0    ...
 }
 
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined. \a *from does not need to be aligned, and can be null if \a count is zero.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                              Index count) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  constexpr Index PacketSize = unpacket_traits<Packet>::size;
+  eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+  Scalar aux[PacketSize];
+  memset(static_cast<void*>(aux), 0x00, sizeof(Scalar) * PacketSize);
+  smart_copy(from + begin, from + begin + count, aux + begin);
+  return ploadu<Packet>(aux);
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined. \a *from must be aligned, and cannot be null.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                             Index count) {
+  return ploaduSegment<Packet>(from, begin, count);
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined. \a *to does not need to be aligned, and can be
+null if \a count is zero.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  constexpr Index PacketSize = unpacket_traits<Packet>::size;
+  eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+  Scalar aux[PacketSize];
+  pstoreu<Scalar, Packet>(aux, from);
+  smart_copy(aux + begin, aux + begin + count, to + begin);
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined. \a *to must be aligned, and cannot be
+null.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  return pstoreuSegment(to, from, begin, count);
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined.*/
+template <typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                              Index count) {
+  constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+  if (Alignment >= RequiredAlignment) {
+    return ploadSegment<Packet>(from, begin, count);
+  } else {
+    return ploaduSegment<Packet>(from, begin, count);
+  }
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined.*/
+template <typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+  if (Alignment >= RequiredAlignment) {
+    pstoreSegment<Scalar, Packet>(to, from, begin, count);
+  } else {
+    pstoreuSegment<Scalar, Packet>(to, from, begin, count);
+  }
+}
+
+#ifndef EIGEN_NO_IO
+
+template <typename Packet>
+class StreamablePacket {
+ public:
+  using Scalar = typename unpacket_traits<Packet>::type;
+  StreamablePacket(const Packet& packet) { pstoreu(v_, packet); }
+
+  friend std::ostream& operator<<(std::ostream& os, const StreamablePacket& packet) {
+    os << "{" << packet.v_[0];
+    for (int i = 1; i < unpacket_traits<Packet>::size; ++i) {
+      os << "," << packet.v_[i];
+    }
+    os << "}";
+    return os;
+  }
+
+ private:
+  Scalar v_[unpacket_traits<Packet>::size];
+};
+
+/**
+ * \internal \returns an intermediary that can be used to ostream packets, e.g. for debugging.
+ */
+template <typename Packet>
+StreamablePacket<Packet> postream(const Packet& packet) {
+  return StreamablePacket<Packet>(packet);
+}
+
+#endif  // EIGEN_NO_IO
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index 454e560..358239c 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -225,14 +225,14 @@
     return this->nestedExpression().data() + row_offset + col_offset;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept {
     if (traits<Derived>::InnerStrideAtCompileTime != Dynamic) {
       return traits<Derived>::InnerStrideAtCompileTime;
     }
     return innerIncrement() * this->nestedExpression().innerStride();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept {
     if (traits<Derived>::OuterStrideAtCompileTime != Dynamic) {
       return traits<Derived>::OuterStrideAtCompileTime;
     }
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index 013ad0a..79fc3ab 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -51,8 +51,8 @@
 
   explicit EIGEN_DEVICE_FUNC Inverse(const XprType& xpr) : m_xpr(xpr) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.rows(); }
 
   EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
 
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index df7b7ca..c740da7 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -102,11 +102,11 @@
   typedef PointerType PointerArgType;
   EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
            : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic
                ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 1e83fdf..5e3d746 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -84,9 +84,9 @@
   typedef typename Base::CoeffReturnType CoeffReturnType;
 
   /** \copydoc DenseBase::rows() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_rows.value(); }
   /** \copydoc DenseBase::cols() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_cols.value(); }
 
   /** Returns a pointer to the first coefficient of the matrix or vector.
    *
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 528aed2..941961d 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -170,8 +170,8 @@
 
 template <typename Scalar>
 struct imag_ref_default_impl<Scalar, false> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); }
+  EIGEN_DEVICE_FUNC constexpr static Scalar run(Scalar&) { return Scalar(0); }
+  EIGEN_DEVICE_FUNC constexpr static const Scalar run(const Scalar&) { return Scalar(0); }
 };
 
 template <typename Scalar>
@@ -182,6 +182,10 @@
   typedef typename NumTraits<Scalar>::Real& type;
 };
 
+// implementation in MathFunctionsImpl.h
+template <typename Mask, bool is_built_in_float = std::is_floating_point<Mask>::value>
+struct scalar_select_mask;
+
 }  // namespace internal
 
 namespace numext {
@@ -207,6 +211,11 @@
   return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
 }
 
+template <typename Scalar, typename Mask>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Mask& mask, const Scalar& a, const Scalar& b) {
+  return internal::scalar_select_mask<Mask>::run(mask) ? b : a;
+}
+
 }  // namespace numext
 
 namespace internal {
@@ -827,8 +836,8 @@
 
 template <typename T>
 EIGEN_DEVICE_FUNC
-    std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
-    isnan_impl(const T&) {
+std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
+isnan_impl(const T&) {
   return false;
 }
 
@@ -936,6 +945,38 @@
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) { return x; }
 };
 
+// Default implementation.
+template <typename Scalar, typename Enable = void>
+struct fma_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return a * b + c;
+  }
+};
+
+// ADL version if it exists.
+template <typename T>
+struct fma_impl<
+    T,
+    std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>> {
+  static T run(const T& a, const T& b, const T& c) { return fma(a, b, c); }
+};
+
+#if defined(EIGEN_GPUCC)
+template <>
+struct fma_impl<float, void> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float run(const float& a, const float& b, const float& c) {
+    return ::fmaf(a, b, c);
+  }
+};
+
+template <>
+struct fma_impl<double, void> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double run(const double& a, const double& b, const double& c) {
+    return ::fma(a, b, c);
+  }
+};
+#endif
+
 }  // end namespace internal
 
 /****************************************************************************
@@ -1256,7 +1297,7 @@
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) {
   using UnsignedT = typename internal::make_unsigned<T>::type;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
@@ -1269,7 +1310,7 @@
 // Integer round down to nearest power of b
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T, typename U>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T round_down(T a, U b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) {
   using UnsignedT = typename internal::make_unsigned<T>::type;
   using UnsignedU = typename internal::make_unsigned<U>::type;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
@@ -1282,7 +1323,7 @@
 
 /** Log base 2 for 32 bits positive integers.
  * Conveniently returns 0 for x==0. */
-EIGEN_CONSTEXPR inline int log2(int x) {
+constexpr int log2(int x) {
   unsigned int v(x);
   constexpr int table[32] = {0, 9,  1,  10, 13, 21, 2,  29, 11, 14, 16, 18, 22, 25, 3, 30,
                              8, 12, 20, 28, 15, 17, 24, 7,  19, 27, 23, 6,  26, 5,  4, 31};
@@ -1320,11 +1361,17 @@
 
 /** \returns the cube root of \a x. **/
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cbrt(const T& x) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!NumTraits<T>::IsComplex, T> cbrt(const T& x) {
   EIGEN_USING_STD(cbrt);
   return static_cast<T>(cbrt(x));
 }
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsComplex, T> cbrt(const T& x) {
+  EIGEN_USING_STD(pow);
+  return pow(x, typename NumTraits<T>::Real(1.0 / 3.0));
+}
+
 /** \returns the reciprocal square root of \a x. **/
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T rsqrt(const T& x) {
@@ -1353,17 +1400,17 @@
 #endif
 
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
-    abs(const T& x) {
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
+abs(const T& x) {
   EIGEN_USING_STD(abs);
   return abs(x);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
-    abs(const T& x) {
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
+abs(const T& x) {
   return x;
 }
 
@@ -1843,6 +1890,15 @@
   return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n);
 }
 
+// Use std::fma if available.
+using std::fma;
+
+// Otherwise, rely on template implementation.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) {
+  return internal::fma_impl<Scalar>::run(x, y, z);
+}
+
 }  // end namespace numext
 
 namespace internal {
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 8e2705b..cbac1c2 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -76,7 +76,7 @@
   static_assert(Steps > 0, "Steps must be at least 1.");
   using Scalar = typename unpacket_traits<Packet>::type;
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) {
-    constexpr Scalar kMinusHalf = Scalar(-1) / Scalar(2);
+    const Scalar kMinusHalf = Scalar(-1) / Scalar(2);
     const Packet cst_minus_half = pset1<Packet>(kMinusHalf);
     const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
 
@@ -256,6 +256,48 @@
   return ComplexT(numext::log(a), b);
 }
 
+// For generic scalars, use ternary select.
+template <typename Mask>
+struct scalar_select_mask<Mask, /*is_built_in_float*/ false> {
+  static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) { return numext::is_exactly_zero(mask); }
+};
+
+// For built-in float mask, bitcast the mask to its integer counterpart and use ternary select.
+template <typename Mask>
+struct scalar_select_mask<Mask, /*is_built_in_float*/ true> {
+  using IntegerType = typename numext::get_integer_by_size<sizeof(Mask)>::unsigned_type;
+  static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) {
+    return numext::is_exactly_zero(numext::bit_cast<IntegerType>(std::abs(mask)));
+  }
+};
+
+template <int Size = sizeof(long double)>
+struct ldbl_select_mask {
+  static constexpr int MantissaDigits = std::numeric_limits<long double>::digits;
+  static constexpr int NumBytes = (MantissaDigits == 64 ? 80 : 128) / CHAR_BIT;
+  static EIGEN_DEVICE_FUNC inline bool run(const long double& mask) {
+    const uint8_t* mask_bytes = reinterpret_cast<const uint8_t*>(&mask);
+    for (Index i = 0; i < NumBytes; i++) {
+      if (mask_bytes[i] != 0) return false;
+    }
+    return true;
+  }
+};
+
+template <>
+struct ldbl_select_mask<sizeof(double)> : scalar_select_mask<double> {};
+
+template <>
+struct scalar_select_mask<long double, true> : ldbl_select_mask<> {};
+
+template <typename RealMask>
+struct scalar_select_mask<std::complex<RealMask>, false> {
+  using impl = scalar_select_mask<RealMask>;
+  static EIGEN_DEVICE_FUNC inline bool run(const std::complex<RealMask>& mask) {
+    return impl::run(numext::real(mask)) && impl::run(numext::imag(mask));
+  }
+};
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 67590fb..a2c8eba 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -258,8 +258,8 @@
   /** \brief Moves the matrix into the other one.
    *
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other)
-      EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) noexcept(
+      std::is_nothrow_move_assignable<Scalar>::value) {
     Base::operator=(std::move(other));
     return *this;
   }
@@ -393,8 +393,8 @@
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other) : Base(other.derived()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
 
   /////////// Geometry module ///////////
 
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index ec360eb..2ce83a8 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -45,8 +45,8 @@
 
   EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
 
   EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 1dc3448..5e4e5c2 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -22,13 +22,13 @@
 template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits; }
 };
 
 template <typename T>
 struct default_digits_impl<T, false, false>  // Floating point
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+  EIGEN_DEVICE_FUNC constexpr static int run() {
     using std::ceil;
     using std::log2;
     typedef typename NumTraits<T>::Real Real;
@@ -39,7 +39,7 @@
 template <typename T>
 struct default_digits_impl<T, false, true>  // Integer
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
 };
 
 // default implementation of digits10(), based on numeric_limits if specialized,
@@ -47,13 +47,13 @@
 template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits10; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits10; }
 };
 
 template <typename T>
 struct default_digits10_impl<T, false, false>  // Floating point
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+  EIGEN_DEVICE_FUNC constexpr static int run() {
     using std::floor;
     using std::log10;
     typedef typename NumTraits<T>::Real Real;
@@ -64,7 +64,7 @@
 template <typename T>
 struct default_digits10_impl<T, false, true>  // Integer
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
 };
 
 // default implementation of max_digits10(), based on numeric_limits if specialized,
@@ -72,13 +72,13 @@
 template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_max_digits10_impl {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::max_digits10; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::max_digits10; }
 };
 
 template <typename T>
 struct default_max_digits10_impl<T, false, false>  // Floating point
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+  EIGEN_DEVICE_FUNC constexpr static int run() {
     using std::ceil;
     using std::log10;
     typedef typename NumTraits<T>::Real Real;
@@ -89,7 +89,7 @@
 template <typename T>
 struct default_max_digits10_impl<T, false, true>  // Integer
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
 };
 
 }  // end namespace internal
@@ -188,32 +188,30 @@
   typedef T Nested;
   typedef T Literal;
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl<T>::run(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits10() { return internal::default_digits10_impl<T>::run(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() {
-    return internal::default_max_digits10_impl<T>::run();
-  }
+  EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return internal::default_max_digits10_impl<T>::run(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits() { return internal::default_digits_impl<T>::run(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits() { return internal::default_digits_impl<T>::run(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
+  EIGEN_DEVICE_FUNC constexpr static int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
+  EIGEN_DEVICE_FUNC constexpr static int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() {
+  EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() {
     // make sure to override this for floating-point types
     return Real(0);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits<T>::max)(); }
+  EIGEN_DEVICE_FUNC constexpr static T highest() { return (numext::numeric_limits<T>::max)(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { return (numext::numeric_limits<T>::lowest)(); }
+  EIGEN_DEVICE_FUNC constexpr static T lowest() { return (numext::numeric_limits<T>::lowest)(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits<T>::infinity(); }
+  EIGEN_DEVICE_FUNC constexpr static T infinity() { return numext::numeric_limits<T>::infinity(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
+  EIGEN_DEVICE_FUNC constexpr static T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
 };
 
 template <typename T>
@@ -221,25 +219,23 @@
 
 template <>
 struct NumTraits<float> : GenericNumTraits<float> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; }
+  EIGEN_DEVICE_FUNC constexpr static float dummy_precision() { return 1e-5f; }
 };
 
 template <>
 struct NumTraits<double> : GenericNumTraits<double> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; }
+  EIGEN_DEVICE_FUNC constexpr static double dummy_precision() { return 1e-12; }
 };
 
 // GPU devices treat `long double` as `double`.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template <>
 struct NumTraits<long double> : GenericNumTraits<long double> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double dummy_precision() {
-    return static_cast<long double>(1e-15l);
-  }
+  EIGEN_DEVICE_FUNC constexpr static long double dummy_precision() { return static_cast<long double>(1e-15l); }
 
 #if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
   // PowerPC double double causes issues with some values
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double epsilon() {
+  EIGEN_DEVICE_FUNC constexpr static long double epsilon() {
     // 2^(-(__LDBL_MANT_DIG__)+1)
     return static_cast<long double>(2.4651903288156618919116517665087e-32l);
   }
@@ -260,10 +256,10 @@
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits<Real>::digits10(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() { return NumTraits<Real>::max_digits10(); }
+  EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits10() { return NumTraits<Real>::digits10(); }
+  EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return NumTraits<Real>::max_digits10(); }
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -290,25 +286,19 @@
                                                       : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar dummy_precision() {
-    return NumTraits<RealScalar>::dummy_precision();
-  }
+  EIGEN_DEVICE_FUNC constexpr static RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 
-  EIGEN_CONSTEXPR
-  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
-  EIGEN_CONSTEXPR
-  static inline int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
+  constexpr static int digits10() { return NumTraits<Scalar>::digits10(); }
+  constexpr static int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
 };
 
 template <>
 struct NumTraits<std::string> : GenericNumTraits<std::string> {
   enum { RequireInitialization = 1, ReadCost = HugeCost, AddCost = HugeCost, MulCost = HugeCost };
 
-  EIGEN_CONSTEXPR
-  static inline int digits10() { return 0; }
-  EIGEN_CONSTEXPR
-  static inline int max_digits10() { return 0; }
+  constexpr static int digits10() { return 0; }
+  constexpr static int max_digits10() { return 0; }
 
  private:
   static inline std::string epsilon();
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
index 7b2c8dc..1f638f9 100644
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -103,19 +103,36 @@
   EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) {
     if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
 
-    const Index size4 = (size - 1) & (~3);
+    const Index size4 = 1 + numext::round_down(size - 1, 4);
     PacketType p = eval.template packetByOuterInner<Unaligned, PacketType>(0, 0);
-    Index i = 1;
     // This loop is optimized for instruction pipelining:
     // - each iteration generates two independent instructions
     // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
-    for (; i < size4; i += 4)
+    for (Index i = 1; i < size4; i += 4)
       p = func.packetOp(
           p, func.packetOp(func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 0, 0),
                                          eval.template packetByOuterInner<Unaligned, PacketType>(i + 1, 0)),
                            func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 2, 0),
                                          eval.template packetByOuterInner<Unaligned, PacketType>(i + 3, 0))));
-    for (; i < size; ++i) p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+    for (Index i = size4; i < size; ++i)
+      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+    return p;
+  }
+};
+
+template <typename Func, typename Evaluator>
+struct packetwise_segment_redux_impl {
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin,
+                                          Index count) {
+    if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
+
+    PacketType p = eval.template packetSegmentByOuterInner<Unaligned, PacketType>(0, 0, begin, count);
+    for (Index i = 1; i < size; ++i)
+      p = func.packetOp(p, eval.template packetSegmentByOuterInner<Unaligned, PacketType>(i, 0, begin, count));
     return p;
   }
 };
@@ -174,14 +191,13 @@
 
   template <int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packet(Index idx) const {
-    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
-    typedef Block<const ArgTypeNestedCleaned, Direction == Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
-                  Direction == Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime), true /* InnerPanel */>
-        PanelType;
-
-    PanelType panel(m_arg, Direction == Vertical ? 0 : idx, Direction == Vertical ? idx : 0,
-                    Direction == Vertical ? m_arg.rows() : Index(PacketSize),
-                    Direction == Vertical ? Index(PacketSize) : m_arg.cols());
+    static constexpr int PacketSize = internal::unpacket_traits<PacketType>::size;
+    static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : PacketSize;
+    static constexpr int PanelCols = Direction == Vertical ? PacketSize : ArgType::ColsAtCompileTime;
+    using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+    using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+    using BinaryOp = typename MemberOp::BinaryOp;
+    using Impl = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>;
 
     // FIXME
     // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of
@@ -189,11 +205,39 @@
     // by pass "vectorization" in this case:
     if (PacketSize == 1) return internal::pset1<PacketType>(coeff(idx));
 
-    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
+    Index startRow = Direction == Vertical ? 0 : idx;
+    Index startCol = Direction == Vertical ? idx : 0;
+    Index numRows = Direction == Vertical ? m_arg.rows() : PacketSize;
+    Index numCols = Direction == Vertical ? PacketSize : m_arg.cols();
+
+    PanelType panel(m_arg, startRow, startCol, numRows, numCols);
     PanelEvaluator panel_eval(panel);
-    typedef typename MemberOp::BinaryOp BinaryOp;
-    PacketType p = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>::template run<PacketType>(
-        panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+    PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+    return p;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index i, Index j, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(Direction == Vertical ? j : i, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packetSegment(Index idx, Index begin, Index count) const {
+    static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : Dynamic;
+    static constexpr int PanelCols = Direction == Vertical ? Dynamic : ArgType::ColsAtCompileTime;
+    using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+    using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+    using BinaryOp = typename MemberOp::BinaryOp;
+    using Impl = internal::packetwise_segment_redux_impl<BinaryOp, PanelEvaluator>;
+
+    Index startRow = Direction == Vertical ? 0 : idx;
+    Index startCol = Direction == Vertical ? idx : 0;
+    Index numRows = Direction == Vertical ? m_arg.rows() : begin + count;
+    Index numCols = Direction == Vertical ? begin + count : m_arg.cols();
+
+    PanelType panel(m_arg, startRow, startCol, numRows, numCols);
+    PanelEvaluator panel_eval(panel);
+    PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), begin, count);
     return p;
   }
 
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index eca7e1f..a78305e 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -162,8 +162,8 @@
   EIGEN_DEVICE_FUNC Base& base() { return *static_cast<Base*>(this); }
   EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast<const Base*>(this); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_storage.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_storage.cols(); }
 
   /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
@@ -298,7 +298,7 @@
    *
    * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
    */
-  EIGEN_DEVICE_FUNC inline constexpr void resize(Index size) {
+  EIGEN_DEVICE_FUNC constexpr void resize(Index size) {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
     eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) ||
                   SizeAtCompileTime == size) &&
@@ -323,7 +323,7 @@
    *
    * \sa resize(Index,Index)
    */
-  EIGEN_DEVICE_FUNC inline constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
+  EIGEN_DEVICE_FUNC constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
 
   /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special
    * value \c NoChange as in the example below.
@@ -333,7 +333,7 @@
    *
    * \sa resize(Index,Index)
    */
-  EIGEN_DEVICE_FUNC inline constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
+  EIGEN_DEVICE_FUNC constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
 
   /** Resizes \c *this to have the same dimensions as \a other.
    * Takes care of doing all the checking that's needed.
@@ -450,7 +450,7 @@
   /** \brief Move constructor */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(PlainObjectBase&&) = default;
   /** \brief Move assignment operator */
-  EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) noexcept {
     m_storage = std::move(other.m_storage);
     return *this;
   }
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 37683e3..e16c7cc 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -224,8 +224,8 @@
                  "if you wanted a coeff-wise or a dot product use the respective explicit functions");
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 9de6481..ce8d954 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -283,7 +283,7 @@
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
   template <typename T>
-  struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {};
+  struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
   typedef typename Product<Lhs, Rhs>::Scalar Scalar;
 
   // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
@@ -445,7 +445,7 @@
 
     eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
                       blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
-                      std::conditional_t<HasScalarFactor, true_type, false_type>());
+                      bool_constant<HasScalarFactor>());
   }
 
  protected:
@@ -635,6 +635,24 @@
     return packet<LoadMode, PacketType>(row, col);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
+                                                                       Index count) const {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+        PacketImpl;
+    PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
+    return res;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return packetSegment<LoadMode, PacketType>(row, col, begin, count);
+  }
+
  protected:
   add_const_on_value_type_t<LhsNested> m_lhs;
   add_const_on_value_type_t<RhsNested> m_rhs;
@@ -670,6 +688,13 @@
     res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
                 rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
+                rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
+  }
 };
 
 template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -681,6 +706,13 @@
     res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
                 pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);
+    res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
+                pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -689,6 +721,12 @@
                                                         Index /*innerDim*/, Packet& res) {
     res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),
+               rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -697,6 +735,12 @@
                                                         Index /*innerDim*/, Packet& res) {
     res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
+               pset1<Packet>(rhs.coeff(Index(0), col)));
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -705,6 +749,11 @@
                                                         const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -713,6 +762,11 @@
                                                         const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -723,6 +777,13 @@
     for (Index i = 0; i < innerDim; ++i)
       res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
+                  res);
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -733,6 +794,13 @@
     for (Index i = 0; i < innerDim; ++i)
       res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
+                  res);
+  }
 };
 
 /***************************************************************************
@@ -871,6 +939,26 @@
                           m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::true_type) const {
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::false_type) const {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = plain_enum_min(
+          LoadMode,
+          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
+  }
+
   evaluator<DiagonalType> m_diagImpl;
   evaluator<MatrixType> m_matImpl;
 };
@@ -892,7 +980,8 @@
   typedef typename XprType::PlainObject PlainObject;
   typedef typename Lhs::DiagonalVectorType DiagonalType;
 
-  enum { StorageOrder = Base::StorageOrder_ };
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
 
@@ -905,8 +994,7 @@
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
     // See also similar calls below.
-    return this->template packet_impl<LoadMode, PacketType>(
-        row, col, row, std::conditional_t<int(StorageOrder) == RowMajor, internal::true_type, internal::false_type>());
+    return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
   }
 
   template <int LoadMode, typename PacketType>
@@ -914,6 +1002,19 @@
     return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
                                         int(StorageOrder) == ColMajor ? 0 : idx);
   }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);
+  }
 #endif
 };
 
@@ -933,7 +1034,8 @@
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
 
-  enum { StorageOrder = Base::StorageOrder_ };
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
 
@@ -944,14 +1046,23 @@
 #ifndef EIGEN_GPUCC
   template <int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    return this->template packet_impl<LoadMode, PacketType>(
-        row, col, col, std::conditional_t<int(StorageOrder) == ColMajor, internal::true_type, internal::false_type>());
+    return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
-    return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
-                                        int(StorageOrder) == ColMajor ? 0 : idx);
+    return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);
   }
 #endif
 };
diff --git a/Eigen/src/Core/RandomImpl.h b/Eigen/src/Core/RandomImpl.h
index 76e43f5..efba336 100644
--- a/Eigen/src/Core/RandomImpl.h
+++ b/Eigen/src/Core/RandomImpl.h
@@ -122,7 +122,7 @@
                             ((std::numeric_limits<long double>::digits != (2 * std::numeric_limits<double>::digits)))>
 struct random_longdouble_impl {
   static constexpr int Size = sizeof(long double);
-  static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<long double>::digits() - 1; }
+  static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<long double>::digits() - 1; }
   static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
     eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
     EIGEN_USING_STD(memcpy);
@@ -140,7 +140,7 @@
 };
 template <>
 struct random_longdouble_impl<false> {
-  static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<double>::digits() - 1; }
+  static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<double>::digits() - 1; }
   static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
     return static_cast<long double>(random_float_impl<double>::run(numRandomBits));
   }
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0c5f2d9..4e9ab0e 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -414,6 +414,13 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const {
     return Base::template packet<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer);
   }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentByOuterInner(Index outer, Index inner, Index begin,
+                                                                             Index count) const {
+    return Base::template packetSegment<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer,
+                                                              begin, count);
+  }
 };
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 129bc85..30ec277 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -73,11 +73,11 @@
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
            : IsVectorAtCompileTime                   ? this->size()
            : int(Flags) & RowMajorBit                ? this->cols()
@@ -97,11 +97,11 @@
   typedef Stride<StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime> StrideBase;
 
   // Resolves inner stride if default 0.
-  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
+  static EIGEN_DEVICE_FUNC constexpr Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
 
   // Resolves outer stride if default 0.
-  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
-                                                                    bool isVectorAtCompileTime, bool isRowMajor) {
+  static EIGEN_DEVICE_FUNC constexpr Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
+                                                              bool isVectorAtCompileTime, bool isRowMajor) {
     return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
   }
 
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 11d7ad1..3415045 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -85,8 +85,8 @@
                         THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
   EIGEN_DEVICE_FUNC const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
 
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 4b34e16..22acdc0 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -215,10 +215,10 @@
   EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
 
   /** \sa MapBase::innerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return m_xpr.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return m_xpr.innerStride(); }
 
   /** \sa MapBase::outerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return (((Flags & RowMajorBit) == RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
   }
 
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 3b5e470..892c193 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -58,12 +58,8 @@
   EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
     static_cast<const Derived*>(this)->evalTo(dst);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT {
-    return static_cast<const Derived*>(this)->rows();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT {
-    return static_cast<const Derived*>(this)->cols();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return static_cast<const Derived*>(this)->rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable \
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index eb06fff..d11ba16 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -87,8 +87,8 @@
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   EIGEN_DEVICE_FUNC inline Index innerStride() const { return -m_matrix.innerStride(); }
 
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 9f46120..0fa5f1e 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -63,8 +63,8 @@
     eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
   }
 
-  inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
-  inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_condition.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_condition.cols(); }
 
   inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i, Index j) const {
     if (m_condition.coeff(i, j))
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 4e9a923..16f0e75 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -73,10 +73,10 @@
 
   EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_matrix.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.innerStride(); }
 
   /** \sa MatrixBase::coeff()
    * \warning the coordinates must fit into the referenced triangular part
diff --git a/Eigen/src/Core/SkewSymmetricMatrix3.h b/Eigen/src/Core/SkewSymmetricMatrix3.h
index 1945fd3..3545afc 100644
--- a/Eigen/src/Core/SkewSymmetricMatrix3.h
+++ b/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -66,7 +66,7 @@
   EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
 
   /** Determinant vanishes */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar determinant() const { return 0; }
+  EIGEN_DEVICE_FUNC constexpr Scalar determinant() const { return 0; }
 
   /** A.transpose() = -A */
   EIGEN_DEVICE_FUNC PlainObject transpose() const { return (-vector()).asSkewSymmetric(); }
@@ -91,9 +91,9 @@
   EIGEN_DEVICE_FUNC inline SkewSymmetricVectorType& vector() { return derived().vector(); }
 
   /** \returns the number of rows. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return 3; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return 3; }
   /** \returns the number of columns. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return 3; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return 3; }
 
   /** \returns the matrix product of \c *this by the dense matrix, \a matrix */
   template <typename MatrixDerived>
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index dfea9c6..aa51410 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -66,8 +66,8 @@
 
   Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 26d62ff..9d31874 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -216,8 +216,8 @@
 
   triangular_solve_retval(const TriangularType& tri, const Rhs& rhs) : m_triangularMatrix(tri), m_rhs(rhs) {}
 
-  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
-  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  constexpr Index rows() const noexcept { return m_rhs.rows(); }
+  constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   template <typename Dest>
   inline void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index 4576cc0..a24d4c2 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -36,11 +36,11 @@
   typedef Index difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 
-  indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
-  indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
+  indexed_based_stl_iterator_base() noexcept : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) noexcept : mp_xpr(&xpr), m_index(index) {}
 
-  indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW : mp_xpr(other.mp_xpr),
-                                                                                    m_index(other.m_index) {}
+  indexed_based_stl_iterator_base(const non_const_iterator& other) noexcept
+      : mp_xpr(other.mp_xpr), m_index(other.m_index) {}
 
   indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) {
     mp_xpr = other.mp_xpr;
@@ -335,15 +335,14 @@
   typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer;
   typedef std::conditional_t<bool(is_lvalue), value_type&, const value_type&> reference;
 
-  pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
-  pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) {
+  pointer_based_stl_iterator() noexcept : m_ptr(0) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) noexcept : m_incr(xpr.innerStride()) {
     m_ptr = xpr.data() + index * m_incr.value();
   }
 
-  pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW : m_ptr(other.m_ptr),
-                                                                               m_incr(other.m_incr) {}
+  pointer_based_stl_iterator(const non_const_iterator& other) noexcept : m_ptr(other.m_ptr), m_incr(other.m_incr) {}
 
-  pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW {
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) noexcept {
     m_ptr = other.m_ptr;
     m_incr.setValue(other.m_incr);
     return *this;
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 14b025c..692f0a1 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -78,9 +78,9 @@
   }
 
   /** \returns the outer stride */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index outer() const { return m_outer.value(); }
   /** \returns the inner stride */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index inner() const { return m_inner.value(); }
 
  protected:
   internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index d417c1a..dd825e9 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -65,6 +65,31 @@
     Index col = Base::colIndexByOuterInner(outer, inner);
     assignPacket<StoreMode, LoadMode, PacketType>(row, col);
   }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+    PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+        row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count);
+    m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count);
+  }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+    PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+        index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count);
+    m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
+  }
+
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
+  // mean no CRTP (Gael)
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
+    Index row = Base::rowIndexByOuterInner(outer, inner);
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count);
+  }
 };
 
 }  // namespace internal
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 89e3d95..0676a25 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -65,8 +65,8 @@
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.rows(); }
 
   /** \returns the nested expression */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index 6fbbbd8..f6dd258 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -293,9 +293,9 @@
  public:
   explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_transpositions.size(); }
 
   /** \returns the \a matrix with the inverse transpositions applied to the columns.
    */
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 2b1683b..27ad78e 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -58,10 +58,10 @@
     eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag))));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
 
   // dummy resize function
   EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
@@ -194,9 +194,9 @@
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
   /** \copydoc EigenBase::rows() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
   /** \copydoc EigenBase::cols() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \returns a const reference to the nested expression */
   EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 9887db6..ac52dc5 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -36,6 +36,7 @@
 class PartialReduxExpr;
 
 namespace internal {
+
 template <typename MatrixType, typename MemberOp, int Direction>
 struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> {
   typedef typename MemberOp::result_type Scalar;
@@ -63,12 +64,8 @@
   EIGEN_DEVICE_FUNC explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
-    return (Direction == Vertical ? 1 : m_matrix.rows());
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
-    return (Direction == Horizontal ? 1 : m_matrix.cols());
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return (Direction == Vertical ? 1 : m_matrix.rows()); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return (Direction == Horizontal ? 1 : m_matrix.cols()); }
 
   EIGEN_DEVICE_FUNC typename MatrixType::Nested nestedExpression() const { return m_matrix; }
 
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 198ec95..0450e2d 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -25,14 +25,12 @@
 template <typename Visitor, bool ShortCircuitEvaluation = false>
 struct short_circuit_eval_impl {
   // if short circuit evaluation is not used, do nothing
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
 };
 template <typename Visitor>
 struct short_circuit_eval_impl<Visitor, true> {
   // if short circuit evaluation is used, check the visitor
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) {
-    return visitor.done();
-  }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) { return visitor.done(); }
 };
 
 // unrolled inner-outer traversal
@@ -296,9 +294,9 @@
 
   EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType& xpr) : m_evaluator(xpr), m_xpr(xpr) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_xpr.size(); }
   // outer-inner access
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_evaluator.coeff(row, col);
@@ -632,6 +630,17 @@
   };
 };
 
+template <typename Derived, bool AlwaysTrue = NumTraits<typename traits<Derived>::Scalar>::IsInteger>
+struct all_finite_impl {
+  static EIGEN_DEVICE_FUNC inline bool run(const Derived& /*derived*/) { return true; }
+};
+#if !defined(__FINITE_MATH_ONLY__) || !(__FINITE_MATH_ONLY__)
+template <typename Derived>
+struct all_finite_impl<Derived, false> {
+  static EIGEN_DEVICE_FUNC inline bool run(const Derived& derived) { return derived.array().isFiniteTyped().all(); }
+};
+#endif
+
 }  // end namespace internal
 
 /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
@@ -781,7 +790,7 @@
  */
 template <typename Derived>
 EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::allFinite() const {
-  return derived().array().isFiniteTyped().all();
+  return internal::all_finite_impl<Derived>::run(derived());
 }
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index d5506da..a4a87c4 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -475,19 +475,11 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet4cf pnmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
-  __m256 a_odd = _mm256_movehdup_ps(a.v);
-  __m256 a_even = _mm256_moveldup_ps(a.v);
-  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmaddsub_ps(a_even, b.v, c.v));
-  return Packet4cf(result);
+  return pnegate(pmsub(a, b, c));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4cf pnmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
-  __m256 a_odd = _mm256_movehdup_ps(a.v);
-  __m256 a_even = _mm256_moveldup_ps(a.v);
-  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmsubadd_ps(a_even, b.v, c.v));
-  return Packet4cf(result);
+  return pnegate(pmadd(a, b, c));
 }
 // std::complex<double>
 template <>
@@ -508,21 +500,64 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cd pnmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
-  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
-  __m256d a_even = _mm256_movedup_pd(a.v);
-  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
-  __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmaddsub_pd(a_even, b.v, c.v));
-  return Packet2cd(result);
+  return pnegate(pmsub(a, b, c));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
-  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
-  __m256d a_even = _mm256_movedup_pd(a.v);
-  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
-  __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmsubadd_pd(a_even, b.v, c.v));
-  return Packet2cd(result);
+  return pnegate(pmadd(a, b, c));
 }
 #endif
+
+/*---------------- load/store segment support ----------------*/
+
+/*---------------- std::complex<float> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cf> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4cf> : std::true_type {};
+
+template <>
+inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) {
+  return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin,
+                                                           Index count) {
+  _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
+}
+
+template <>
+inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) {
+  return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin,
+                                                           Index count) {
+  _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
+}
+
+/*---------------- std::complex<double> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cd> : std::true_type {};
+
+template <>
+inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) {
+  return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+                                                            Index begin, Index count) {
+  _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
+}
+
+/*---------------- end load/store segment support ----------------*/
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index a5c38e7..5b7285f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -28,6 +28,7 @@
 EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
 #ifdef EIGEN_VECTORIZE_AVX2
 EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
@@ -106,6 +107,8 @@
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp2)
@@ -118,6 +121,7 @@
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+#endif
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index c29523a..470e36d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -122,6 +122,7 @@
     HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
@@ -150,6 +151,7 @@
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasATan = 1,
     HasATanh = 1,
     HasBlend = 1
@@ -1839,10 +1841,13 @@
   return a;
 }
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
   return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
 }
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
   return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
@@ -2044,10 +2049,13 @@
   return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
 }
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE bool predux_any(const Packet8h& x) {
   return _mm_movemask_epi8(x) != 0;
 }
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& x) {
   return _mm_movemask_epi8(x) != 0;
@@ -2211,7 +2219,6 @@
   };
   typedef Packet8h half;
 };
-#endif
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
@@ -2411,6 +2418,26 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet8h pmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
@@ -2446,14 +2473,12 @@
   to[stride * 7] = aux[7];
 }
 
-#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
   Packet8f af = half2float(a);
   float reduced = predux<Packet8f>(af);
   return Eigen::half(reduced);
 }
-#endif
 
 template <>
 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
@@ -2553,6 +2578,8 @@
   kernel.packet[3] = pload<Packet8h>(out[3]);
 }
 
+#endif
+
 // BFloat16 implementation.
 
 EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
@@ -2781,6 +2808,26 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
@@ -2893,6 +2940,258 @@
   kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
 }
 
+/*---------------- load/store segment support ----------------*/
+
+// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x8(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 4);
+  long long mask = 1;
+  mask <<= CHAR_BIT * count;
+  mask--;
+  mask <<= CHAR_BIT * begin;
+#if defined(_WIN32) && !defined(_WIN64)
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+  return _mm_cvtsi64_si128(mask);
+#endif
+}
+
+// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_8x8(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 8);
+  long long mask = 1;
+  // avoid UB when count == 8
+  mask <<= (CHAR_BIT / 2) * count;
+  mask <<= (CHAR_BIT / 2) * count;
+  mask--;
+  mask <<= CHAR_BIT * begin;
+#if defined(_WIN32) && !defined(_WIN64)
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+  return _mm_cvtsi64_si128(mask);
+#endif
+}
+
+// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x32(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 4);
+  return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));
+}
+
+// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_2x64(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 2);
+  return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));
+}
+
+// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_8x32(Index begin, Index count) {
+  __m128i mask_epi8 = segment_mask_8x8(begin, count);
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);
+#else
+  __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);
+  __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));
+  __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
+#endif
+  return mask_epi32;
+}
+
+// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_4x64(Index begin, Index count) {
+  __m128i mask_epi8 = segment_mask_4x8(begin, count);
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);
+#else
+  __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);
+  __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));
+  __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
+#endif
+  return mask_epi64;
+}
+
+/*---------------- float ----------------*/
+
+template <>
+struct has_packet_segment<Packet4f> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8f> : std::true_type {};
+
+template <>
+inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
+  return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
+  _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
+  return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
+  _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
+}
+
+/*---------------- int32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4i> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8i> : std::true_type {};
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+  return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+  _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+  return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+  _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
+}
+
+#else
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+  return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+  pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+  return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+  pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
+}
+
+#endif
+
+/*---------------- uint32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ui> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8ui> : std::true_type {};
+
+template <>
+inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
+  return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
+  pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
+}
+
+template <>
+inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
+  return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
+  pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
+}
+
+/*---------------- double ----------------*/
+
+template <>
+struct has_packet_segment<Packet2d> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4d> : std::true_type {};
+
+template <>
+inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
+  return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
+  _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
+}
+
+template <>
+inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
+  return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
+  _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/*---------------- int64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet2l> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4l> : std::true_type {};
+
+template <>
+inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
+  return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
+  _mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
+}
+template <>
+inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
+  return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
+  _mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
+}
+
+/*---------------- uint64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ul> : std::true_type {};
+
+template <>
+inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
+  return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
+}
+template <>
+inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
+  pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
+}
+#endif
+
+/*---------------- end load/store segment support ----------------*/
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 9dcd6ef..5b73ffe 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -279,20 +279,22 @@
 }
 #endif
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
   return half2float(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
-  return Bf16ToF32(a);
-}
-
-template <>
 EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
   return float2half(a);
 }
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+  return Bf16ToF32(a);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 6039254..04499a0 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,16 +47,16 @@
 
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& _x) {
-  return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& x) {
+  return generic_sqrt_newton_step<Packet16f>::run(x, _mm512_rsqrt14_ps(x));
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& x) {
 #ifdef EIGEN_VECTORIZE_AVX512ER
-  return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
 #else
-  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
 #endif
 }
 #else
@@ -80,19 +80,19 @@
 #elif EIGEN_FAST_MATH
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& _x) {
-  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(x, _mm512_rsqrt14_ps(x));
 }
 #endif
 
 // prsqrt for double.
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& x) {
 #ifdef EIGEN_VECTORIZE_AVX512ER
-  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
 #else
-  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
 #endif
 }
 
@@ -118,6 +118,8 @@
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2)
@@ -130,6 +132,7 @@
 F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+#endif  // EIGEN_VECTORIZE_AVX512FP16
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
new file mode 100644
index 0000000..240ade4
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+#define EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+  __m512i result = _mm512_castsi256_si512(_mm256_castph_si256(a));
+  result = _mm512_inserti64x4(result, _mm256_castph_si256(b), 1);
+  return _mm512_castsi512_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+  a = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_castph_si512(x)));
+  b = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(_mm512_castph_si512(x), 1));
+}
+
+#define _EIGEN_GENERATE_FP16_MATH_FUNCTION(func)                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet8h func<Packet8h>(const Packet8h& a) {    \
+    return float2half(func(half2float(a)));                           \
+  }                                                                   \
+                                                                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet16h func<Packet16h>(const Packet16h& a) { \
+    return float2half(func(half2float(a)));                           \
+  }                                                                   \
+                                                                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet32h func<Packet32h>(const Packet32h& a) { \
+    Packet16h low;                                                    \
+    Packet16h high;                                                   \
+    extract2Packet16h(a, low, high);                                  \
+    return combine2Packet16h(func(low), func(high));                  \
+  }
+
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(psin)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pcos)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog2)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog1p)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexpm1)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp2)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(ptanh)
+#undef _EIGEN_GENERATE_FP16_MATH_FUNCTION
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
\ No newline at end of file
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 5d869e4..27a0f10 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -40,6 +40,10 @@
 #endif
 typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
 
+typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
+typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
+typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
+
 template <>
 struct is_arithmetic<__m512> {
   enum { value = true };
@@ -124,6 +128,7 @@
     HasATanh = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
@@ -149,6 +154,7 @@
     HasBlend = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,
@@ -249,6 +255,39 @@
 #endif
 
 template <>
+struct unpacket_traits<Packet32s> {
+  typedef numext::int16_t type;
+  typedef Packet16s half;
+  enum {
+    size = 32,
+    alignment = Aligned64,
+    vectorizable = false,
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16s> {
+  typedef numext::int16_t type;
+  typedef Packet8s half;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = false,
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8s> {
+  typedef numext::int16_t type;
+  typedef Packet8s half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = false,
+  };
+};
+
+template <>
 EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
   return _mm512_set1_ps(from);
 }
@@ -1335,10 +1374,13 @@
   return _mm512_abs_epi64(a);
 }
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
   return _mm256_srai_epi16(a, 15);
 }
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
   return _mm256_srai_epi16(a, 15);
@@ -2199,6 +2241,7 @@
 }
 
 // Packet math for Eigen::half
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
   return _mm256_set1_epi16(from.x);
@@ -2223,6 +2266,7 @@
 EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
   // (void*) -> workaround clang warning:
   // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  EIGEN_DEBUG_ALIGNED_STORE
   _mm256_store_si256((__m256i*)(void*)to, from);
 }
 
@@ -2230,6 +2274,7 @@
 EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
   // (void*) -> workaround clang warning:
   // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  EIGEN_DEBUG_UNALIGNED_STORE
   _mm256_storeu_si256((__m256i*)(void*)to, from);
 }
 
@@ -2369,7 +2414,6 @@
   return _mm256_xor_si256(a, sign_mask);
 }
 
-#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
@@ -2403,13 +2447,31 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
 EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
   Packet16f from_float = half2float(from);
   return half(predux(from_float));
 }
 
-#endif
-
 template <>
 EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
   Packet8h lane0 = _mm256_extractf128_si256(a, 0);
@@ -2643,6 +2705,8 @@
   kernel.packet[3] = pload<Packet16h>(out[3]);
 }
 
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 struct is_arithmetic<Packet16bf> {
   enum { value = true };
@@ -2714,11 +2778,13 @@
 
 template <>
 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
   _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
 }
 
@@ -2889,7 +2955,27 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
-  return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+  return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
 }
 
 template <>
@@ -3095,6 +3181,172 @@
   kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
 }
 
+// Minimal implementation of 16-bit int packets for use in pfrexp, pldexp.
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) {
+  return _mm512_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) {
+  return _mm256_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) {
+  return _mm_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  _mm512_store_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm256_store_epi32(out, x);
+#else
+  _mm256_store_si256(reinterpret_cast<__m256i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm256_store_epi32(out, x);
+#else
+  _mm_store_si128(reinterpret_cast<__m128i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm512_storeu_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) {
+  return _mm512_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) {
+  return _mm256_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) {
+  return _mm_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) {
+  return _mm512_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) {
+  return _mm256_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) {
+  return _mm_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) {
+  return _mm512_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) {
+  return _mm256_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) {
+  return _mm_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) {
+  return _mm512_sub_epi16(_mm512_setzero_si512(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) {
+  return _mm256_sub_epi16(_mm256_setzero_si256(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return _mm_sub_epi16(_mm_setzero_si128(), a);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
+  return _mm512_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
+  return _mm256_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+  return _mm_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
+  return _mm512_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
+  return _mm256_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+  return _mm_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
+  return _mm512_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
+  return _mm256_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+  return _mm_srli_epi16(a, N);
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index 038e233..ef64bc5 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-//
+// Copyright (C) 2025 The Eigen Authors.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,8 +18,8 @@
 namespace internal {
 
 typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+typedef __m256h Packet16h;
+typedef __m128h Packet8h;
 
 template <>
 struct is_arithmetic<Packet8h> {
@@ -68,6 +68,7 @@
 struct unpacket_traits<Packet32h> {
   typedef Eigen::half type;
   typedef Packet16h half;
+  typedef Packet32s integer_packet;
   enum {
     size = 32,
     alignment = Aligned64,
@@ -81,6 +82,7 @@
 struct unpacket_traits<Packet16h> {
   typedef Eigen::half type;
   typedef Packet8h half;
+  typedef Packet16s integer_packet;
   enum {
     size = 16,
     alignment = Aligned32,
@@ -94,6 +96,7 @@
 struct unpacket_traits<Packet8h> {
   typedef Eigen::half type;
   typedef Packet8h half;
+  typedef Packet8s integer_packet;
   enum {
     size = 8,
     alignment = Aligned16,
@@ -103,14 +106,33 @@
   };
 };
 
+// Conversions
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { return _mm256_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { return _mm512_cvtxps_ph(a); }
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { return _mm256_cvtxps_ph(a); }
+
 // Memory functions
 
 // pset1
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
-  // half/half_raw is bit compatible
-  return _mm512_set1_ph(numext::bit_cast<_Float16>(from));
+  return _mm512_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_ph(from.x);
 }
 
 template <>
@@ -118,24 +140,47 @@
   return _mm512_setzero_ph();
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) {
+  return _mm256_setzero_ph();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) {
+  return _mm_setzero_ph();
+}
+
 // pset1frombits
 template <>
 EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
   return _mm512_castsi512_ph(_mm512_set1_epi16(from));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) {
+  return _mm256_castsi256_ph(_mm256_set1_epi16(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) {
+  return _mm_castsi128_ph(_mm_set1_epi16(from));
+}
+
 // pfirst
 
 template <>
 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  return half_impl::raw_uint16_to_half(
-      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
-  Eigen::half dest[32];
-  _mm512_storeu_ph(dest, from);
-  return dest[0];
-#endif
+  return Eigen::half(_mm512_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return Eigen::half(_mm256_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return Eigen::half(_mm_cvtsh_h(from));
 }
 
 // pload
@@ -145,6 +190,16 @@
   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from);
+}
+
 // ploadu
 
 template <>
@@ -152,6 +207,16 @@
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from);
+}
+
 // pstore
 
 template <>
@@ -159,6 +224,16 @@
   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
 }
 
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from);
+}
+
 // pstoreu
 
 template <>
@@ -166,6 +241,16 @@
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
 }
 
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from);
+}
+
 // ploaddup
 template <>
 EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
@@ -175,6 +260,17 @@
                                a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
+  __m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from));
+  return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
+  return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x);
+}
+
 // ploadquad
 template <>
 EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
@@ -184,6 +280,17 @@
       a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
+  return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x,
+                       from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
+  return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
 // pabs
 
 template <>
@@ -191,6 +298,16 @@
   return _mm512_abs_ph(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) {
+  return _mm256_abs_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) {
+  return _mm_abs_ph(a);
+}
+
 // psignbit
 
 template <>
@@ -198,6 +315,16 @@
   return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) {
+  return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15));
+}
+
 // pmin
 
 template <>
@@ -205,6 +332,16 @@
   return _mm512_min_ph(a, b);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_min_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_min_ph(a, b);
+}
+
 // pmax
 
 template <>
@@ -212,6 +349,16 @@
   return _mm512_max_ph(a, b);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_max_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_max_ph(a, b);
+}
+
 // plset
 template <>
 EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
@@ -219,6 +366,16 @@
                                                           16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return _mm256_add_ph(pset1<Packet16h>(a), _mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0));
+}
+
 // por
 
 template <>
@@ -226,6 +383,16 @@
   return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
 // pxor
 
 template <>
@@ -233,6 +400,16 @@
   return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
 // pand
 
 template <>
@@ -240,6 +417,16 @@
   return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
 // pandnot
 
 template <>
@@ -247,6 +434,16 @@
   return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a)));
+}
+
 // pselect
 
 template <>
@@ -255,6 +452,18 @@
   return _mm512_mask_blend_ph(mask32, a, b);
 }
 
+template <>
+EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ);
+  return _mm256_mask_blend_ph(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ);
+  return _mm_mask_blend_ph(mask8, a, b);
+}
+
 // pcmp_eq
 
 template <>
@@ -263,6 +472,18 @@
   return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
 // pcmp_le
 
 template <>
@@ -271,6 +492,18 @@
   return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
 // pcmp_lt
 
 template <>
@@ -279,6 +512,18 @@
   return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
 // pcmp_lt_or_nan
 
 template <>
@@ -287,6 +532,18 @@
   return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
 // padd
 
 template <>
@@ -296,12 +553,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_add_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_add_ph(a, b);
 }
 
 // psub
@@ -313,12 +570,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_sub_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_sub_ph(a, b);
 }
 
 // pmul
@@ -330,12 +587,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_mul_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_mul_ph(a, b);
 }
 
 // pdiv
@@ -347,12 +604,13 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_div_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_div_ph(a, b);
+
 }
 
 // pround
@@ -361,14 +619,40 @@
 EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
   // Work-around for default std::round rounding mode.
 
-  // Mask for the sign bit
-  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
-  // The largest half-preicision float less than 0.5
+  // Mask for the sign bit.
+  const Packet32h signMask =
+      pset1frombits<Packet32h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
   const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
 
   return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit.
+  const Packet16h signMask =
+      pset1frombits<Packet16h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
+  const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit.
+  const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
+  const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
 // print
 
 template <>
@@ -376,6 +660,16 @@
   return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
 // pceil
 
 template <>
@@ -383,6 +677,16 @@
   return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
 // pfloor
 
 template <>
@@ -390,6 +694,16 @@
   return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
 // ptrunc
 
 template <>
@@ -397,47 +711,99 @@
   return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
 // predux
 template <>
 EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
-  return (half)_mm512_reduce_add_ph(a);
+  return half(_mm512_reduce_add_ph(a));
 }
 
 template <>
 EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
-  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+  return half(_mm256_reduce_add_ph(a));
 }
 
 template <>
 EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
-  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+  return half(_mm_reduce_add_ph(a));
 }
 
 // predux_half_dowto4
 template <>
 EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
-  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+  const __m512i bits = _mm512_castph_si512(a);
+  Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
+  Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
+  return padd(lo, hi);
+}
 
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
-  Eigen::half data[32];
-  _mm512_storeu_ph(data, a);
-
-  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
-  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));
+  Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));
+  return padd(lo, hi);
 }
 
 // predux_max
 
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_max_ph(a));
+}
+
 // predux_min
 
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_min_ph(a));
+}
+
 // predux_mul
 
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_mul_ph(a));
+}
+
 #ifdef EIGEN_VECTORIZE_FMA
 
 // pmadd
@@ -449,12 +815,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fmadd_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fmadd_ph(a, b, c);
 }
 
 // pmsub
@@ -466,12 +832,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fmsub_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fmsub_ph(a, b, c);
 }
 
 // pnmadd
@@ -483,12 +849,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fnmadd_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fnmadd_ph(a, b, c);
 }
 
 // pnmsub
@@ -500,12 +866,12 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fnmsub_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fnmsub_ph(a, b, c);
 }
 
 #endif
@@ -514,35 +880,74 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
-  return psub(pzero(a), a);
+  return _mm512_castsi512_ph(
+      _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_ph(
+      _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
 }
 
 // pconj
 
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
-  return a;
-}
+// Nothing, packets are real.
 
 // psqrt
 
 template <>
 EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_sqrt_ph(a);
+  return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) {
+  return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) {
+  return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a));
 }
 
 // prsqrt
 
 template <>
 EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_rsqrt_ph(a);
+  return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) {
+  return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) {
+  return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a));
 }
 
 // preciprocal
 
 template <>
 EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
-  return _mm512_rcp_ph(a);
+  return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) {
+  return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) {
+  return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a));
 }
 
 // ptranspose
@@ -663,6 +1068,246 @@
   a.packet[3] = _mm512_castsi512_ph(a3);
 }
 
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
+  __m256i a = _mm256_castph_si256(kernel.packet[0]);
+  __m256i b = _mm256_castph_si256(kernel.packet[1]);
+  __m256i c = _mm256_castph_si256(kernel.packet[2]);
+  __m256i d = _mm256_castph_si256(kernel.packet[3]);
+  __m256i e = _mm256_castph_si256(kernel.packet[4]);
+  __m256i f = _mm256_castph_si256(kernel.packet[5]);
+  __m256i g = _mm256_castph_si256(kernel.packet[6]);
+  __m256i h = _mm256_castph_si256(kernel.packet[7]);
+  __m256i i = _mm256_castph_si256(kernel.packet[8]);
+  __m256i j = _mm256_castph_si256(kernel.packet[9]);
+  __m256i k = _mm256_castph_si256(kernel.packet[10]);
+  __m256i l = _mm256_castph_si256(kernel.packet[11]);
+  __m256i m = _mm256_castph_si256(kernel.packet[12]);
+  __m256i n = _mm256_castph_si256(kernel.packet[13]);
+  __m256i o = _mm256_castph_si256(kernel.packet[14]);
+  __m256i p = _mm256_castph_si256(kernel.packet[15]);
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = _mm256_castsi256_ph(a_p_0);
+  kernel.packet[1] = _mm256_castsi256_ph(a_p_1);
+  kernel.packet[2] = _mm256_castsi256_ph(a_p_2);
+  kernel.packet[3] = _mm256_castsi256_ph(a_p_3);
+  kernel.packet[4] = _mm256_castsi256_ph(a_p_4);
+  kernel.packet[5] = _mm256_castsi256_ph(a_p_5);
+  kernel.packet[6] = _mm256_castsi256_ph(a_p_6);
+  kernel.packet[7] = _mm256_castsi256_ph(a_p_7);
+  kernel.packet[8] = _mm256_castsi256_ph(a_p_8);
+  kernel.packet[9] = _mm256_castsi256_ph(a_p_9);
+  kernel.packet[10] = _mm256_castsi256_ph(a_p_a);
+  kernel.packet[11] = _mm256_castsi256_ph(a_p_b);
+  kernel.packet[12] = _mm256_castsi256_ph(a_p_c);
+  kernel.packet[13] = _mm256_castsi256_ph(a_p_d);
+  kernel.packet[14] = _mm256_castsi256_ph(a_p_e);
+  kernel.packet[15] = _mm256_castsi256_ph(a_p_f);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2 * i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j + 8] = in[j][2 * i + 1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4 * i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][4 * i + 1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 8] = in[j][4 * i + 2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 12] = in[j][4 * i + 3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
+  __m128i a = _mm_castph_si128(kernel.packet[0]);
+  __m128i b = _mm_castph_si128(kernel.packet[1]);
+  __m128i c = _mm_castph_si128(kernel.packet[2]);
+  __m128i d = _mm_castph_si128(kernel.packet[3]);
+  __m128i e = _mm_castph_si128(kernel.packet[4]);
+  __m128i f = _mm_castph_si128(kernel.packet[5]);
+  __m128i g = _mm_castph_si128(kernel.packet[6]);
+  __m128i h = _mm_castph_si128(kernel.packet[7]);
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = _mm_castsi128_ph(a0b0c0d0e0f0g0h0);
+  kernel.packet[1] = _mm_castsi128_ph(a1b1c1d1e1f1g1h1);
+  kernel.packet[2] = _mm_castsi128_ph(a2b2c2d2e2f2g2h2);
+  kernel.packet[3] = _mm_castsi128_ph(a3b3c3d3e3f3g3h3);
+  kernel.packet[4] = _mm_castsi128_ph(a4b4c4d4e4f4g4h4);
+  kernel.packet[5] = _mm_castsi128_ph(a5b5c5d5e5f5g5h5);
+  kernel.packet[6] = _mm_castsi128_ph(a6b6c6d6e6f6g6h6);
+  kernel.packet[7] = _mm_castsi128_ph(a7b7c7d7e7f7g7h7);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2 * i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][2 * i + 1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
 // preverse
 
 template <>
@@ -672,6 +1317,20 @@
                                a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm256_castsi256_ph(_mm256_insertf128_si256(
+      _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)),
+      _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m));
+}
+
 // pscatter
 
 template <>
@@ -684,191 +1343,68 @@
     to[stride * i] = aux[i];
   }
 }
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+}
 
 // pgather
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
-  return _mm512_castsi512_ph(_mm512_set_epi16(
-      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
-      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
-      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
-      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
-      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
-      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
-      from[1 * stride].x, from[0 * stride].x));
+  return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x,
+                       from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x,
+                       from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
+                       from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, from[16 * stride].x,
+                       from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                       from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                       from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                       from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
-  __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
-  return _mm512_castpd_ph(result);
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+  return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                       from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                       from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                       from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
-  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
-  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
 template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = psin(low);
-  Packet16h highOut = psin(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pcos(low);
-  Packet16h highOut = pcos(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog(low);
-  Packet16h highOut = plog(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog2(low);
-  Packet16h highOut = plog2(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog1p(low);
-  Packet16h highOut = plog1p(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexp(low);
-  Packet16h highOut = pexp(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexpm1(low);
-  Packet16h highOut = pexpm1(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = ptanh(low);
-  Packet16h highOut = ptanh(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1 = _mm256_undefined_si256();
-  Packet16h exp2 = _mm256_undefined_si256();
-
-  Packet16h lowOut = pfrexp(low, exp1);
-  Packet16h highOut = pfrexp(high, exp2);
-
-  exponent = combine2Packet16h(exp1, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1;
-  Packet16h exp2;
-  extract2Packet16h(exponent, exp1, exp2);
-
-  Packet16h lowOut = pldexp(low, exp1);
-  Packet16h highOut = pldexp(high, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+  return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x,
+                    from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index 9508ac6..fc55fd8 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -237,17 +237,13 @@
   return _mm512_castsi512_si128(a);
 }
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
   return _mm256_castsi256_si128(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
-  return _mm256_castsi256_si128(a);
-}
-
-template <>
 EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
   return half2float(a);
 }
@@ -257,6 +253,13 @@
   return float2half(a);
 }
 
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
+  return _mm256_castsi256_si128(a);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
   return Bf16ToF32(a);
@@ -267,68 +270,6 @@
   return F32ToBf16(a);
 }
 
-#ifdef EIGEN_VECTORIZE_AVX512FP16
-
-template <>
-EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
-  return _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
-  return _mm256_castsi256_si128(preinterpret<Packet16h>(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
-  // Discard second-half of input.
-  Packet16h low = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
-  return _mm512_cvtxph_ps(_mm256_castsi256_ph(low));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
-  __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(
-      result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
-  result = _mm512_insertf64x4(
-      result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
-  return _mm512_castpd_ph(result);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
-  // Discard second-half of input.
-  Packet8h low = _mm_castps_si128(_mm256_extractf32x4_ps(_mm256_castsi256_ps(a), 0));
-  return _mm256_cvtxph_ps(_mm_castsi128_ph(low));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
-  __m256d result = _mm256_undefined_pd();
-  result = _mm256_insertf64x2(result,
-                              _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
-  result = _mm256_insertf64x2(result,
-                              _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
-  return _mm256_castpd_si256(result);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
-  Packet8f full = _mm256_cvtxph_ps(_mm_castsi128_ph(a));
-  // Discard second-half of input.
-  return _mm256_extractf32x4_ps(full, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
-  __m256 result = _mm256_undefined_ps();
-  result = _mm256_insertf128_ps(result, a, 0);
-  result = _mm256_insertf128_ps(result, b, 1);
-  return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT);
-}
-
-#endif
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
new file mode 100644
index 0000000..f06f13d
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_FP16_AVX512_H
+#define EIGEN_TYPE_CASTING_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE Packet32s preinterpret<Packet32s, Packet32h>(const Packet32h& a) {
+  return _mm512_castph_si512(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s preinterpret<Packet16s, Packet16h>(const Packet16h& a) {
+  return _mm256_castph_si256(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8h>(const Packet8h& a) {
+  return _mm_castph_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preinterpret<Packet32h, Packet32s>(const Packet32s& a) {
+  return _mm512_castsi512_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet16s>(const Packet16s& a) {
+  return _mm256_castsi256_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet8s>(const Packet8s& a) {
+  return _mm_castsi128_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
+  // Discard second-half of input.
+  Packet16h low = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
+  return _mm512_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
+  // Discard second-half of input.
+  Packet8h low = _mm_castps_ph(_mm256_extractf32x4_ps(_mm256_castph_ps(a), 0));
+  return _mm256_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
+  Packet8f full = _mm256_cvtxph_ps(a);
+  // Discard second-half of input.
+  return _mm256_extractf32x4_ps(full, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
+  __m512 result = _mm512_castsi512_ps(_mm512_castsi256_si512(_mm256_castph_si256(_mm512_cvtxps_ph(a))));
+  result = _mm512_insertf32x8(result, _mm256_castph_ps(_mm512_cvtxps_ph(b)), 1);
+  return _mm512_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
+  __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castph_si128(_mm256_cvtxps_ph(a))));
+  result = _mm256_insertf32x4(result, _mm_castph_ps(_mm256_cvtxps_ph(b)), 1);
+  return _mm256_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
+  __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castps_si128(a)));
+  result = _mm256_insertf128_ps(result, b, 1);
+  return _mm256_cvtxps_ph(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pcast<Packet32h, Packet32s>(const Packet32h& a) {
+  return _mm512_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s pcast<Packet16h, Packet16s>(const Packet16h& a) {
+  return _mm256_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8h, Packet8s>(const Packet8h& a) {
+  return _mm_cvtph_epi16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet32s, Packet32h>(const Packet32s& a) {
+  return _mm512_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16s, Packet16h>(const Packet16s& a) {
+  return _mm256_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8s, Packet8h>(const Packet8s& a) {
+  return _mm_cvtepi16_ph(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 482064e..d7bd9be 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -186,6 +186,7 @@
     HasExp = 1,
 #ifdef EIGEN_VECTORIZE_VSX
     HasSqrt = 1,
+    HasCbrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
 #else
@@ -424,55 +425,6 @@
     masked_store_available = false
   };
 };
-inline std::ostream& operator<<(std::ostream& s, const Packet16c& v) {
-  union {
-    Packet16c v;
-    signed char n[16];
-  } vt;
-  vt.v = v;
-  for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet16uc& v) {
-  union {
-    Packet16uc v;
-    unsigned char n[16];
-  } vt;
-  vt.v = v;
-  for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
-  union {
-    Packet4f v;
-    float n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
-  union {
-    Packet4i v;
-    int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
-  union {
-    Packet4ui v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
 
 template <typename Packet>
 EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
@@ -2385,6 +2337,44 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
 }
@@ -3187,6 +3177,7 @@
     HasLog = 0,
     HasExp = 1,
     HasSqrt = 1,
+    HasCbrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
 #else
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index b8d3b4f..f2e55f3 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -673,6 +673,11 @@
   return bfloat16(::fmaxf(f1, f2));
 }
 
+EIGEN_DEVICE_FUNC inline bfloat16 fma(const bfloat16& a, const bfloat16& b, const bfloat16& c) {
+  // Emulate FMA via float.
+  return bfloat16(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
+}
+
 #ifndef EIGEN_NO_IO
 EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
   os << static_cast<float>(v);
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 174eb57..e9f564b 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -187,7 +187,7 @@
   static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
                        ExponentBits = TotalBits - MantissaBits - 1;
 
-  EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
+  constexpr ScalarUI scalar_sign_mantissa_mask =
       ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits);  // ~0x7f800000
   const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
   const Packet half = pset1<Packet>(Scalar(0.5));
@@ -196,7 +196,7 @@
 
   // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
   const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
-  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
+  constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
   // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
   const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));  // 2^24
   const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
@@ -289,6 +289,142 @@
   return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
 }
 
+// This function implements a single step of Halley's iteration for
+// computing x = y^(1/3):
+//   x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
+                                                                                      const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
+  Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
+  Packet num = psub(x_k_cb, y);
+  Packet r = pdiv(num, denom);
+  return pnmadd(x_k, r, x_k);
+}
+
+// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+// interval [0.125,1].
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Extract the significand s in the range [0.5,1) and exponent e, such that
+  // x = 2^e * s.
+  Packet e, s;
+  s = pfrexp(x, e);
+
+  // Split the exponent into a part divisible by 3 and the remainder.
+  // e = 3*e_div3 + e_mod3.
+  constexpr Scalar kOneThird = Scalar(1) / 3;
+  e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
+  Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
+
+  // Replace s by y = (s * 2^e_mod3).
+  return pldexp_fast(s, e_mod3);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
+                                                                                       const Packet& abs_root) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  // Set sign.
+  const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
+  const Packet x_sign = pand(sign_mask, x);
+  Packet root = por(x_sign, abs_root);
+
+  // Pass non-finite and zero values of x straight through.
+  const Packet is_not_finite = por(pisinf(x), pisnan(x));
+  const Packet is_zero = pcmp_eq(pzero(x), x);
+  const Packet use_x = por(is_not_finite, is_zero);
+  return pselect(use_x, x, root);
+}
+
+// Generic implementation of cbrt(x) for float.
+//
+// The algorithm computes the cube root of the input by first
+// decomposing it into an exponent and significand
+//   x = s * 2^e.
+//
+// We can then write the cube root as
+//
+//   x^(1/3) = 2^(e/3) * s^(1/3)
+//           = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
+//           = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
+//           = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
+//
+// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
+//
+// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
+// approximated using a cubic polynomial and subsequently refined using a
+// single step of Halley's iteration, and finally the two terms are combined
+// using pldexp_fast.
+//
+// Note: Many alternatives exist for implementing cbrt. See, for example,
+// the excellent discussion in Kahan's note:
+//   https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
+// This particular implementation was found to be very fast and accurate
+// among several alternatives tried, but is probably not "optimal" on all
+// platforms.
+//
+// This is accurate to 2 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 5.22e-3.
+  // The polynomial was computed using Rminimax.
+  constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
+                             3.408401906490325927734375e-01f};
+  Packet r = ppolevl<Packet, 3>::run(y, alpha);
+
+  // Take one step of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3)
+  r = pldexp_fast(r, e_div3);
+
+  return cbrt_special_cases_and_sign(x, r);
+}
+
+// Generic implementation of cbrt(x) for double.
+//
+// The algorithm is identical to the one for float except that a different initial
+// approximation is used for y^(1/3) and two Halley iteration steps are performed.
+//
+// This is accurate to 1 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 0.016.
+  // The polynomial was computed using Rminimax.
+  constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
+                              1.072314636518546304699839311069808900356292724609375e+00,
+                              3.81249427609571867048288140722434036433696746826171875e-01};
+  Packet r = ppolevl<Packet, 2>::run(y, alpha);
+
+  // Take two steps of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3).
+  r = pldexp_fast(r, e_div3);
+  return cbrt_special_cases_and_sign(x, r);
+}
+
 // Natural or base 2 logarithm.
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -1123,7 +1259,7 @@
 
   constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
 
-  const Packet cst_signmask = pset1<Packet>(-Scalar(0));
+  const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
   const Packet cst_one = pset1<Packet>(Scalar(1));
   const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
 
@@ -1685,7 +1821,7 @@
 template <typename Packet>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
   const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
   const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
   Packet rho = psub(x, gamma);
@@ -1991,7 +2127,7 @@
   Packet m_x = pfrexp(x, e_x);
 
   // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
-  EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
+  constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
   const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
   m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
   e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
@@ -2074,7 +2210,7 @@
   const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
   const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
   // Smallest exponent for which (1 + epsilon) overflows to infinity.
-  EIGEN_CONSTEXPR Scalar huge_exponent =
+  constexpr Scalar huge_exponent =
       (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
   const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
 
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index ac0e2cf..673954e 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -54,6 +54,14 @@
 template <typename Packet>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent);
 
+/** \internal \returns cbrt(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x_in);
+
+/** \internal \returns cbrt(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x_in);
+
 /** \internal \returns log(x) for single precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
@@ -195,6 +203,7 @@
   EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET)                 \
   EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET)                \
   EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(cbrt, PACKET)                \
   EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET)             \
   EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)              \
   EIGEN_GENERIC_PACKET_FUNCTION(log1p, PACKET)             \
@@ -208,6 +217,7 @@
   EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET)                \
   EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)                 \
   EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, PACKET)                \
   EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)               \
   EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)
 
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 95697f3..ba70d5f 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -37,21 +37,23 @@
 // IWYU pragma: private
 #include "../../InternalHeaderCheck.h"
 
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
 // When compiling with GPU support, the "__half_raw" base class as well as
 // some other routines are defined in the GPU compiler header files
 // (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
 // As a consequence, we get compile failures when compiling Eigen with
 // GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
-// Eigen with GPU support
-#pragma push_macro("EIGEN_CONSTEXPR")
-#undef EIGEN_CONSTEXPR
-#define EIGEN_CONSTEXPR
+// Eigen with GPU support.
+// Any functions that require `numext::bit_cast` may also not be constexpr,
+// including any native types when setting via raw bit values.
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#define _EIGEN_MAYBE_CONSTEXPR
+#else
+#define _EIGEN_MAYBE_CONSTEXPR constexpr
 #endif
 
 #define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)                                                  \
   template <>                                                                                              \
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+  EIGEN_UNUSED EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
     return float2half(METHOD<PACKET_F>(half2float(_x)));                                                   \
   }
 
@@ -81,8 +83,10 @@
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
 #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
+  struct construct_from_rep_tag {};
 #if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
   // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
   // The element type for shared memory cannot have non-trivial constructors
@@ -91,43 +95,53 @@
   // hence the need for this
   EIGEN_DEVICE_FUNC __half_raw() {}
 #else
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw() : x(0) {}
 #endif
+
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, __fp16 rep) : x{rep} {}
   __fp16 x;
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<_Float16>(raw)) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, _Float16 rep) : x{rep} {}
+  _Float16 x;
 #else
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
+  explicit EIGEN_DEVICE_FUNC constexpr __half_raw(numext::uint16_t raw) : x(raw) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, numext::uint16_t rep) : x{rep} {}
   numext::uint16_t x;
 #endif
 };
 
 #elif defined(EIGEN_HAS_HIP_FP16)
-// Nothing to do here
+// HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
 #elif defined(EIGEN_HAS_CUDA_FP16)
+
+// CUDA GPU compile phase.
 #if EIGEN_CUDA_SDK_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
 typedef __half __half_raw;
 #endif  // defined(EIGEN_HAS_CUDA_FP16)
+
 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
 #endif
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
 
 struct half_base : public __half_raw {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {}
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
 
 #if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
 #elif defined(EIGEN_HAS_CUDA_FP16)
 #if EIGEN_CUDA_SDK_VER >= 90000
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 #endif
 #endif
@@ -156,21 +170,29 @@
 #endif
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
 
 #if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #elif defined(EIGEN_HAS_CUDA_FP16)
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
 #endif
 #endif
 
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
+      : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(_Float16 b)
+      : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#endif
+
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   template <class T>
   explicit EIGEN_DEVICE_FUNC half(T val)
@@ -201,99 +223,99 @@
 namespace half_impl {
 template <typename = void>
 struct numeric_limits_half_impl {
-  static EIGEN_CONSTEXPR const bool is_specialized = true;
-  static EIGEN_CONSTEXPR const bool is_signed = true;
-  static EIGEN_CONSTEXPR const bool is_integer = false;
-  static EIGEN_CONSTEXPR const bool is_exact = false;
-  static EIGEN_CONSTEXPR const bool has_infinity = true;
-  static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
-  static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
+  static constexpr const bool is_specialized = true;
+  static constexpr const bool is_signed = true;
+  static constexpr const bool is_integer = false;
+  static constexpr const bool is_exact = false;
+  static constexpr const bool has_infinity = true;
+  static constexpr const bool has_quiet_NaN = true;
+  static constexpr const bool has_signaling_NaN = true;
   EIGEN_DIAGNOSTICS(push)
   EIGEN_DISABLE_DEPRECATED_WARNING
-  static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present;
-  static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
+  static constexpr const std::float_denorm_style has_denorm = std::denorm_present;
+  static constexpr const bool has_denorm_loss = false;
   EIGEN_DIAGNOSTICS(pop)
-  static EIGEN_CONSTEXPR const std::float_round_style round_style = std::round_to_nearest;
-  static EIGEN_CONSTEXPR const bool is_iec559 = true;
+  static constexpr const std::float_round_style round_style = std::round_to_nearest;
+  static constexpr const bool is_iec559 = true;
   // The C++ standard defines this as "true if the set of values representable
   // by the type is finite." Half has finite precision.
-  static EIGEN_CONSTEXPR const bool is_bounded = true;
-  static EIGEN_CONSTEXPR const bool is_modulo = false;
-  static EIGEN_CONSTEXPR const int digits = 11;
-  static EIGEN_CONSTEXPR const int digits10 =
+  static constexpr const bool is_bounded = true;
+  static constexpr const bool is_modulo = false;
+  static constexpr const int digits = 11;
+  static constexpr const int digits10 =
       3;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static EIGEN_CONSTEXPR const int max_digits10 =
+  static constexpr const int max_digits10 =
       5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
-  static EIGEN_CONSTEXPR const int min_exponent = -13;
-  static EIGEN_CONSTEXPR const int min_exponent10 = -4;
-  static EIGEN_CONSTEXPR const int max_exponent = 16;
-  static EIGEN_CONSTEXPR const int max_exponent10 = 4;
-  static EIGEN_CONSTEXPR const bool traps = std::numeric_limits<float>::traps;
+  static constexpr const int radix = std::numeric_limits<float>::radix;
+  static constexpr const int min_exponent = -13;
+  static constexpr const int min_exponent10 = -4;
+  static constexpr const int max_exponent = 16;
+  static constexpr const int max_exponent10 = 4;
+  static constexpr const bool traps = std::numeric_limits<float>::traps;
   // IEEE754: "The implementer shall choose how tininess is detected, but shall
   // detect tininess in the same way for all operations in radix two"
-  static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
+  static constexpr const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
 
-  static EIGEN_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
-  static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
-  static EIGEN_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
-  static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
-  static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
-  static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
-  static EIGEN_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
-  static EIGEN_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
-  static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
 };
 
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_specialized;
+constexpr const bool numeric_limits_half_impl<T>::is_specialized;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_signed;
+constexpr const bool numeric_limits_half_impl<T>::is_signed;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_integer;
+constexpr const bool numeric_limits_half_impl<T>::is_integer;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_exact;
+constexpr const bool numeric_limits_half_impl<T>::is_exact;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_infinity;
+constexpr const bool numeric_limits_half_impl<T>::has_infinity;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_quiet_NaN;
+constexpr const bool numeric_limits_half_impl<T>::has_quiet_NaN;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_signaling_NaN;
+constexpr const bool numeric_limits_half_impl<T>::has_signaling_NaN;
 EIGEN_DIAGNOSTICS(push)
 EIGEN_DISABLE_DEPRECATED_WARNING
 template <typename T>
-EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
+constexpr const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_denorm_loss;
+constexpr const bool numeric_limits_half_impl<T>::has_denorm_loss;
 EIGEN_DIAGNOSTICS(pop)
 template <typename T>
-EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl<T>::round_style;
+constexpr const std::float_round_style numeric_limits_half_impl<T>::round_style;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_iec559;
+constexpr const bool numeric_limits_half_impl<T>::is_iec559;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_bounded;
+constexpr const bool numeric_limits_half_impl<T>::is_bounded;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_modulo;
+constexpr const bool numeric_limits_half_impl<T>::is_modulo;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits;
+constexpr const int numeric_limits_half_impl<T>::digits;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits10;
+constexpr const int numeric_limits_half_impl<T>::digits10;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_digits10;
+constexpr const int numeric_limits_half_impl<T>::max_digits10;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::radix;
+constexpr const int numeric_limits_half_impl<T>::radix;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent;
+constexpr const int numeric_limits_half_impl<T>::min_exponent;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent10;
+constexpr const int numeric_limits_half_impl<T>::min_exponent10;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent;
+constexpr const int numeric_limits_half_impl<T>::max_exponent;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent10;
+constexpr const int numeric_limits_half_impl<T>::max_exponent10;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::traps;
+constexpr const bool numeric_limits_half_impl<T>::traps;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::tinyness_before;
+constexpr const bool numeric_limits_half_impl<T>::tinyness_before;
 }  // end namespace half_impl
 }  // end namespace Eigen
 
@@ -320,8 +342,7 @@
     (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type since GPU half types are rather different from native CPU half types.
-// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
-#define EIGEN_HAS_NATIVE_FP16
+#define EIGEN_HAS_NATIVE_GPU_FP16
 #endif
 
 // Intrinsics for native fp16 support. Note that on current hardware,
@@ -329,7 +350,7 @@
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
 
-#if defined(EIGEN_HAS_NATIVE_FP16)
+#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
 EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   return __hadd(::__half(a), ::__half(b));
@@ -371,7 +392,8 @@
 EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
 EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
 EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
-#endif
+
+#endif  // EIGEN_HAS_NATIVE_GPU_FP16
 
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
@@ -401,16 +423,47 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(a.x + b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(a.x * b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(a.x - b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(a.x / b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(-a.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return a.x == b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return a.x != b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return a.x < b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return a.x <= b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return a.x > b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return a.x >= b.x; }
+
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
 // of the functions, while the latter can only deal with one of them.
-#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for half floats
+#elif !defined(EIGEN_HAS_NATIVE_GPU_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for half floats
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -458,6 +511,7 @@
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 #pragma pop_macro("EIGEN_DEVICE_FUNC")
 #endif
+
 #endif  // Emulate support for half floats
 
 // Division by an index. Do it in full float precision to avoid accuracy
@@ -493,7 +547,7 @@
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
   // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
   // in the hip_fp16 header file, and that will trigger a compile error
   // On the other hand, having anything but a return statement also triggers a compile error
@@ -515,6 +569,8 @@
   // For SYCL, cl::sycl::half is _Float16, so cast directly.
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return numext::bit_cast<numext::uint16_t>(h.x);
 #elif defined(SYCL_DEVICE_ONLY)
   return numext::bit_cast<numext::uint16_t>(h);
 #else
@@ -528,6 +584,16 @@
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  __half_raw h;
+  h.x = static_cast<__fp16>(ff);
+  return h;
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  __half_raw h;
+  h.x = static_cast<_Float16>(ff);
+  return h;
+
 #elif defined(EIGEN_HAS_FP16_C)
   __half_raw h;
 #if EIGEN_COMP_MSVC
@@ -538,11 +604,6 @@
 #endif
   return h;
 
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  __half_raw h;
-  h.x = static_cast<__fp16>(ff);
-  return h;
-
 #else
   uint32_t f_bits = Eigen::numext::bit_cast<uint32_t>(ff);
   const uint32_t f32infty_bits = {255 << 23};
@@ -595,6 +656,8 @@
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __half2float(h);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return static_cast<float>(h.x);
 #elif defined(EIGEN_HAS_FP16_C)
 #if EIGEN_COMP_MSVC
   // MSVC does not have scalar instructions.
@@ -602,8 +665,6 @@
 #else
   return _cvtsh_ss(h.x);
 #endif
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  return static_cast<float>(h.x);
 #else
   const float magic = Eigen::numext::bit_cast<float>(static_cast<uint32_t>(113 << 23));
   const uint32_t shifted_exp = 0x7c00 << 13;  // exponent mask after shift
@@ -628,7 +689,7 @@
 // --- standard functions ---
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
-#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
 #else
   return (a.x & 0x7fff) == 0x7c00;
@@ -638,7 +699,7 @@
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hisnan(a);
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
 #else
   return (a.x & 0x7fff) > 0x7c00;
@@ -651,6 +712,11 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   return half(vabsh_f16(a.x));
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  half result;
+  result.x =
+      numext::bit_cast<_Float16>(static_cast<numext::uint16_t>(numext::bit_cast<numext::uint16_t>(a.x) & 0x7FFF));
+  return result;
 #else
   half result;
   result.x = a.x & 0x7FFF;
@@ -734,24 +800,19 @@
   return half(::fmodf(float(a), float(b)));
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-  return __hlt(b, a) ? b : a;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { return b < a ? b : a; }
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { return a < b ? b : a; }
+
+EIGEN_DEVICE_FUNC inline half fma(const half& a, const half& b, const half& c) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return half(vfmah_f16(c.x, a.x, b.x));
+#elif defined(EIGEN_VECTORIZE_AVX512FP16)
+  // Reduces to vfmadd213sh.
+  return half(_mm_cvtsh_h(_mm_fmadd_ph(_mm_set_sh(a.x), _mm_set_sh(b.x), _mm_set_sh(c.x))));
 #else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f2 < f1 ? b : a;
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-  return __hlt(a, b) ? b : a;
-#else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f1 < f2 ? b : a;
+  // Emulate FMA via float.
+  return half(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
 #endif
 }
 
@@ -794,31 +855,29 @@
 struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
   enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
     return half_impl::raw_uint16_to_half(0x0800);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
     return half_impl::raw_uint16_to_half(0x211f);  //  Eigen::half(1e-2f);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
     return half_impl::raw_uint16_to_half(0x7bff);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
     return half_impl::raw_uint16_to_half(0xfbff);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
     return half_impl::raw_uint16_to_half(0x7c00);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
     return half_impl::raw_uint16_to_half(0x7e00);
   }
 };
 
 }  // end namespace Eigen
 
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-#pragma pop_macro("EIGEN_CONSTEXPR")
-#endif
+#undef _EIGEN_MAYBE_CONSTEXPR
 
 namespace Eigen {
 namespace numext {
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h
index 6bea9ac..402d92f 100644
--- a/Eigen/src/Core/arch/GPU/Tuple.h
+++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -34,7 +34,7 @@
   template <typename U1 = T1,
             typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
                                                  reduce_all<std::is_default_constructible<Ts>::value...>::value>>
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+  constexpr EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
 
   // Element constructor.
   template <typename U1, typename... Us,
@@ -44,7 +44,7 @@
                 sizeof...(Us) == sizeof...(Ts) && (
                                                       // this does not look like a copy/move constructor.
                                                       N > 1 || std::is_convertible<U1, T1>::value)>>
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
+  constexpr EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
       : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
 
   // The first stored value.
@@ -102,11 +102,11 @@
   using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
   using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType;
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
     return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
   }
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
     return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
   }
 };
@@ -117,11 +117,9 @@
   using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
   using ReturnType = T1;
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) {
-    return tuple.head();
-  }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) { return tuple.head(); }
 };
 
 // Concatenates N Tuples.
@@ -139,11 +137,9 @@
   // Uses the index sequences to extract and merge elements from tuple1 and tuple2,
   // then recursively calls again.
   template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1,
-                                                                              std::index_sequence<I1s...>,
-                                                                              Tuple2&& tuple2,
-                                                                              std::index_sequence<I2s...>,
-                                                                              MoreTuples&&... tuples) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>,
+                                                                        Tuple2&& tuple2, std::index_sequence<I2s...>,
+                                                                        MoreTuples&&... tuples) {
     return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run(
         MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))...,
                         tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...),
@@ -152,8 +148,8 @@
 
   // Concatenates the first two tuples.
   template <typename Tuple1, typename Tuple2, typename... MoreTuples>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
-                                                                              MoreTuples&&... tuples) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
+                                                                        MoreTuples&&... tuples) {
     return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2),
                std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...);
   }
@@ -165,7 +161,7 @@
   using ReturnType = TupleImpl<N, Args...>;
 
   template <typename Tuple1>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
     return tuple1;
   }
 };
@@ -174,7 +170,7 @@
 template <>
 struct tuple_cat_impl<0> {
   using ReturnType = TupleImpl<0>;
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
 };
 
 // For use in make_tuple, unwraps a reference_wrapper.
@@ -211,13 +207,13 @@
  * \return a reference to the desired element.
  */
 template <size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
     const TupleImpl<sizeof...(Types), Types...>& tuple) {
   return tuple_get_impl<Idx, Types...>::run(tuple);
 }
 
 template <size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
     TupleImpl<sizeof...(Types), Types...>& tuple) {
   return tuple_get_impl<Idx, Types...>::run(tuple);
 }
@@ -229,7 +225,7 @@
  */
 template <typename... Tuples, typename EnableIf = std::enable_if_t<
                                   internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
     tuple_cat(Tuples&&... tuples) {
   return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
@@ -239,7 +235,7 @@
  * Tie arguments together into a tuple.
  */
 template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) noexcept {
   return ReturnType{args...};
 }
 
@@ -247,7 +243,7 @@
  * Create a tuple of l-values with the supplied arguments.
  */
 template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
   return ReturnType{std::forward<Args>(args)...};
 }
 
@@ -255,8 +251,7 @@
  * Forward a set of arguments as a tuple.
  */
 template <typename... Args>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(
-    Args&&... args) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) {
   return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...);
 }
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 3f2d9d5..6d7f038 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -207,6 +207,7 @@
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
@@ -4965,6 +4966,26 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
@@ -5140,6 +5161,7 @@
     HasCos = EIGEN_FAST_MATH,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH
@@ -5635,6 +5657,21 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmaq_f16(pnegate(c), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfma_f16(c, pnegate(a), b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfma_f16(pnegate(c), pnegate(a), b);
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   return vminq_f16(a, b);
 }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index c69e3d4..f79da7b 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -465,19 +465,11 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pnmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
-  __m128 a_odd = _mm_movehdup_ps(a.v);
-  __m128 a_even = _mm_moveldup_ps(a.v);
-  __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
-  __m128 result = _mm_fmaddsub_ps(a_odd, b_swap, _mm_fmaddsub_ps(a_even, b.v, c.v));
-  return Packet2cf(result);
+  return pnegate(pmsub(a, b, c));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pnmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
-  __m128 a_odd = _mm_movehdup_ps(a.v);
-  __m128 a_even = _mm_moveldup_ps(a.v);
-  __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
-  __m128 result = _mm_fmaddsub_ps(a_odd, b_swap, _mm_fmsubadd_ps(a_even, b.v, c.v));
-  return Packet2cf(result);
+  return pnegate(pmadd(a, b, c));
 }
 // std::complex<double>
 template <>
@@ -498,19 +490,11 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pnmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
-  __m128d a_odd = _mm_permute_pd(a.v, 0x3);
-  __m128d a_even = _mm_movedup_pd(a.v);
-  __m128d b_swap = _mm_permute_pd(b.v, 0x1);
-  __m128d result = _mm_fmaddsub_pd(a_odd, b_swap, _mm_fmaddsub_pd(a_even, b.v, c.v));
-  return Packet1cd(result);
+  return pnegate(pmsub(a, b, c));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pnmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
-  __m128d a_odd = _mm_permute_pd(a.v, 0x3);
-  __m128d a_even = _mm_movedup_pd(a.v);
-  __m128d b_swap = _mm_permute_pd(b.v, 0x1);
-  __m128d result = _mm_fmaddsub_pd(a_odd, b_swap, _mm_fmsubadd_pd(a_even, b.v, c.v));
-  return Packet1cd(result);
+  return pnegate(pmadd(a, b, c));
 }
 #endif
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 5e91fba..70d13d6 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -195,6 +195,7 @@
     HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
@@ -222,6 +223,7 @@
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasATan = 1,
     HasATanh = 1,
     HasBlend = 1
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 7edcc60..0239262 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -29,6 +29,11 @@
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
     pstoret<DstScalar, Packet, Alignment>(a, b);
   }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+    pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count);
+  }
 };
 
 // Empty overload for void type (used by PermutationMatrix)
@@ -60,6 +65,12 @@
     assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
         a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
   }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+    assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>(
+        a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count);
+  }
 };
 
 template <typename DstScalar, typename SrcScalar, typename Func>
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index c91e6bb..a93b998 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -438,7 +438,6 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
-    maybe_raise_div_by_zero<Packet>::run(b);
     return internal::pdiv(a, b);
   }
 };
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 14b56d7..35dc738 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -28,7 +28,7 @@
   const Scalar m_other;
 };
 template <typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> > {
+struct functor_traits<scalar_constant_op<Scalar>> {
   enum {
     Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
     PacketAccess = packet_traits<Scalar>::Vectorizable,
@@ -56,7 +56,7 @@
   }
 };
 template <typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> > {
+struct functor_traits<scalar_identity_op<Scalar>> {
   enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true };
 };
 
@@ -86,18 +86,19 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
     // Principle:
     // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+    Packet low = pset1<Packet>(m_low);
+    Packet high = pset1<Packet>(m_high);
+    Packet step = pset1<Packet>(m_step);
     if (m_flip) {
       Packet pi = plset<Packet>(Scalar(i - m_size1));
-      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
-      if (EIGEN_PREDICT_TRUE(i != 0)) return res;
-      Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
-      return pselect<Packet>(mask, res, pset1<Packet>(m_low));
+      Packet res = pmadd(step, pi, high);
+      Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i)));
+      return pselect<Packet>(mask, res, low);
     } else {
       Packet pi = plset<Packet>(Scalar(i));
-      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
-      if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits<Packet>::size + 1)) return res;
-      Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size - 1));
-      return pselect<Packet>(mask, res, pset1<Packet>(m_high));
+      Packet res = pmadd(step, pi, low);
+      Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1)));
+      return pselect<Packet>(mask, res, high);
     }
   }
 
@@ -139,7 +140,7 @@
 template <typename Scalar>
 struct linspaced_op;
 template <typename Scalar>
-struct functor_traits<linspaced_op<Scalar> > {
+struct functor_traits<linspaced_op<Scalar>> {
   enum {
     Cost = 1,
     PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
@@ -192,7 +193,7 @@
 };
 
 template <typename Scalar>
-struct functor_traits<equalspaced_op<Scalar> > {
+struct functor_traits<equalspaced_op<Scalar>> {
   enum {
     Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
     PacketAccess =
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 03542e3..ba7d97a 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -558,11 +558,15 @@
 template <typename Scalar>
 struct scalar_cbrt_op {
   EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pcbrt(a);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<scalar_cbrt_op<Scalar>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum { Cost = 20 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCbrt };
 };
 
 /** \internal
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 8f7b7dd..4f36689 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -217,7 +217,7 @@
     // Note that the actual number of threads might be lower than the number of
     // requested ones
     Index actual_threads = omp_get_num_threads();
-    GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
+    GemmParallelInfo<Index> info(static_cast<int>(i), static_cast<int>(actual_threads), task_info);
 
     Index blockCols = (cols / actual_threads) & ~Index(0x3);
     Index blockRows = (rows / actual_threads);
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 5d3f1cf..49f307c 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -285,6 +285,8 @@
 #ifdef __AVX512FP16__
 #ifdef __AVX512VL__
 #define EIGEN_VECTORIZE_AVX512FP16
+// Built-in _Float16.
+#define EIGEN_HAS_BUILTIN_FLOAT16 1
 #else
 #if EIGEN_COMP_GNUC
 #error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
diff --git a/Eigen/src/Core/util/EmulateArray.h b/Eigen/src/Core/util/EmulateArray.h
index f2fd10b..6c4c22d 100644
--- a/Eigen/src/Core/util/EmulateArray.h
+++ b/Eigen/src/Core/util/EmulateArray.h
@@ -248,15 +248,15 @@
 #endif
 
 template <std::size_t I_, class T, std::size_t N>
-constexpr inline T& array_get(std::array<T, N>& a) {
+constexpr T& array_get(std::array<T, N>& a) {
   return (T&)STD_GET_ARR_HACK;
 }
 template <std::size_t I_, class T, std::size_t N>
-constexpr inline T&& array_get(std::array<T, N>&& a) {
+constexpr T&& array_get(std::array<T, N>&& a) {
   return (T&&)STD_GET_ARR_HACK;
 }
 template <std::size_t I_, class T, std::size_t N>
-constexpr inline T const& array_get(std::array<T, N> const& a) {
+constexpr T const& array_get(std::array<T, N> const& a) {
   return (T const&)STD_GET_ARR_HACK;
 }
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 2488be4..3c0bc46 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -514,6 +514,9 @@
 struct eigen_memset_helper;
 template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
 struct eigen_zero_impl;
+
+template <typename Packet>
+struct has_packet_segment : std::false_type {};
 }  // namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 7347dbb..aed8a88 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -1281,11 +1281,6 @@
 #define EIGEN_CATCH(X) else
 #endif
 
-#define EIGEN_NOEXCEPT noexcept
-#define EIGEN_NOEXCEPT_IF(x) noexcept(x)
-#define EIGEN_NO_THROW noexcept(true)
-#define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
-
 // The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
 namespace Eigen {
 namespace internal {
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 0facd26..89b2fff 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -833,46 +833,44 @@
 
 // HIP does not support new/delete on device.
 #if EIGEN_MAX_ALIGN_BYTES != 0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                    \
-  EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
-    EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); }        \
-    EIGEN_CATCH(...) { return 0; }                                                               \
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                              \
+  EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) noexcept { \
+    EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); }  \
+    EIGEN_CATCH(...) { return 0; }                                                         \
   }
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)                                                             \
-  EIGEN_DEVICE_FUNC void* operator new(std::size_t size) {                                                           \
-    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                          \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) {                                                         \
-    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                          \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete(void* ptr) EIGEN_NO_THROW {                                                 \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete[](void* ptr) EIGEN_NO_THROW {                                               \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) EIGEN_NO_THROW {                           \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) EIGEN_NO_THROW {                         \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  /* in-place new and delete. since (at least afaik) there is no actual   */                                         \
-  /* memory allocated we can safely let the default implementation handle */                                         \
-  /* this particular case. */                                                                                        \
-  EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); }     \
-  EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); } \
-  EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) EIGEN_NO_THROW {                                   \
-    return ::operator delete(memory, ptr);                                                                           \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) EIGEN_NO_THROW {                                 \
-    return ::operator delete[](memory, ptr);                                                                         \
-  }                                                                                                                  \
-  /* nothrow-new (returns zero instead of std::bad_alloc) */                                                         \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                                              \
-  EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) EIGEN_NO_THROW {                          \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)                                                              \
+  EIGEN_DEVICE_FUNC void* operator new(std::size_t size) {                                                            \
+    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                           \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) {                                                          \
+    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                           \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr) noexcept {                                                        \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete[](void* ptr) noexcept {                                                      \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) noexcept {                                  \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) noexcept {                                \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  /* in-place new and delete. since (at least afaik) there is no actual   */                                          \
+  /* memory allocated we can safely let the default implementation handle */                                          \
+  /* this particular case. */                                                                                         \
+  EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); }      \
+  EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); }  \
+  EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) noexcept { return ::operator delete(memory, ptr); } \
+  EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) noexcept {                                        \
+    return ::operator delete[](memory, ptr);                                                                          \
+  }                                                                                                                   \
+  /* nothrow-new (returns zero instead of std::bad_alloc) */                                                          \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                                               \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) noexcept {                                 \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
   typedef void eigen_aligned_operator_new_marker_type;
 #else
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 40604f8..ddbc898 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -90,12 +90,8 @@
  * we however don't want to add a dependency to Boost.
  */
 
-struct true_type {
-  enum { value = 1 };
-};
-struct false_type {
-  enum { value = 0 };
-};
+using std::false_type;
+using std::true_type;
 
 template <bool Condition>
 struct bool_constant;
@@ -341,7 +337,7 @@
 #if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
 
 template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(T&& x) {
+constexpr auto index_list_size(T&& x) {
   using std::ssize;
   return ssize(std::forward<T>(x));
 }
@@ -349,13 +345,13 @@
 #else
 
 template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(const T& x) {
+constexpr auto index_list_size(const T& x) {
   using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
   return static_cast<R>(x.size());
 }
 
 template <typename T, std::ptrdiff_t N>
-EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) {
+constexpr std::ptrdiff_t index_list_size(const T (&)[N]) {
   return N;
 }
 #endif
@@ -641,21 +637,21 @@
 constexpr bool is_int_or_enum_v = std::is_enum<A>::value || std::is_integral<A>::value;
 
 template <typename A, typename B>
-inline constexpr void plain_enum_asserts(A, B) {
+constexpr void plain_enum_asserts(A, B) {
   static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
   static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
 }
 
 /// \internal Gets the minimum of two values which may be integers or enums
 template <typename A, typename B>
-inline constexpr int plain_enum_min(A a, B b) {
+constexpr int plain_enum_min(A a, B b) {
   plain_enum_asserts(a, b);
   return ((int)a <= (int)b) ? (int)a : (int)b;
 }
 
 /// \internal Gets the maximum of two values which may be integers or enums
 template <typename A, typename B>
-inline constexpr int plain_enum_max(A a, B b) {
+constexpr int plain_enum_max(A a, B b) {
   plain_enum_asserts(a, b);
   return ((int)a >= (int)b) ? (int)a : (int)b;
 }
@@ -667,7 +663,7 @@
  *  finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
  */
 template <typename A, typename B>
-inline constexpr int min_size_prefer_dynamic(A a, B b) {
+constexpr int min_size_prefer_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == 0 || (int)b == 0) return 0;
   if ((int)a == 1 || (int)b == 1) return 1;
@@ -682,7 +678,7 @@
  * 0 and 3), it is not more than 3.
  */
 template <typename A, typename B>
-inline constexpr int min_size_prefer_fixed(A a, B b) {
+constexpr int min_size_prefer_fixed(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == 0 || (int)b == 0) return 0;
   if ((int)a == 1 || (int)b == 1) return 1;
@@ -694,7 +690,7 @@
 
 /// \internal see `min_size_prefer_fixed`. No need for a separate variant for MaxSizes here.
 template <typename A, typename B>
-inline constexpr int max_size_prefer_dynamic(A a, B b) {
+constexpr int max_size_prefer_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic;
   return plain_enum_max(a, b);
@@ -714,38 +710,38 @@
 }
 
 template <typename A, typename B>
-inline constexpr bool enum_lt_not_dynamic(A a, B b) {
+constexpr bool enum_lt_not_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == Dynamic || (int)b == Dynamic) return false;
   return (int)a < (int)b;
 }
 
 template <typename A, typename B>
-inline constexpr bool enum_le_not_dynamic(A a, B b) {
+constexpr bool enum_le_not_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == Dynamic || (int)b == Dynamic) return false;
   return (int)a <= (int)b;
 }
 
 template <typename A, typename B>
-inline constexpr bool enum_gt_not_dynamic(A a, B b) {
+constexpr bool enum_gt_not_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == Dynamic || (int)b == Dynamic) return false;
   return (int)a > (int)b;
 }
 
 template <typename A, typename B>
-inline constexpr bool enum_ge_not_dynamic(A a, B b) {
+constexpr bool enum_ge_not_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
   if ((int)a == Dynamic || (int)b == Dynamic) return false;
   return (int)a >= (int)b;
 }
 
 /// \internal Calculate logical XOR at compile time
-inline constexpr bool logical_xor(bool a, bool b) { return a != b; }
+constexpr bool logical_xor(bool a, bool b) { return a != b; }
 
 /// \internal Calculate logical IMPLIES at compile time
-inline constexpr bool check_implication(bool a, bool b) { return !a || b; }
+constexpr bool check_implication(bool a, bool b) { return !a || b; }
 
 /// \internal Provide fallback for std::is_constant_evaluated for pre-C++20.
 #if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
diff --git a/Eigen/src/Core/util/ReshapedHelper.h b/Eigen/src/Core/util/ReshapedHelper.h
index e569408..1747950 100644
--- a/Eigen/src/Core/util/ReshapedHelper.h
+++ b/Eigen/src/Core/util/ReshapedHelper.h
@@ -40,7 +40,7 @@
 
 inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { return total / other; }
 
-constexpr inline int get_compiletime_reshape_order(int flags, int order) {
+constexpr int get_compiletime_reshape_order(int flags, int order) {
   return order == AutoOrder ? flags & RowMajorBit : order;
 }
 
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index a42bb0f..a0e160e 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -158,8 +158,8 @@
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
   }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR operator T() const { return T(Value); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr operator T() const { return T(Value); }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T v) const {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
@@ -171,7 +171,7 @@
   T m_value;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {}
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
@@ -186,7 +186,7 @@
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
   }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
@@ -315,7 +315,7 @@
 };
 
 #if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
-constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
   if ((ArrayBytes % AlignmentBytes) == 0) {
     return AlignmentBytes;
   } else if (EIGEN_MIN_ALIGN_BYTES < AlignmentBytes) {
@@ -327,7 +327,7 @@
 #else
 // If static alignment is disabled, no need to bother.
 // This also avoids a division by zero
-constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
   EIGEN_UNUSED_VARIABLE(ArrayBytes);
   EIGEN_UNUSED_VARIABLE(AlignmentBytes);
   return 0;
@@ -362,7 +362,7 @@
   typedef Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_> type;
 };
 
-constexpr inline unsigned compute_matrix_flags(int Options) {
+constexpr unsigned compute_matrix_flags(int Options) {
   unsigned row_major_bit = Options & RowMajor ? RowMajorBit : 0;
   // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
   // and then propagate this information to the evaluator's flags.
@@ -370,7 +370,7 @@
   return DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit;
 }
 
-constexpr inline int size_at_compile_time(int rows, int cols) {
+constexpr int size_at_compile_time(int rows, int cols) {
   if (rows == 0 || cols == 0) return 0;
   if (rows == Dynamic || cols == Dynamic) return Dynamic;
   return rows * cols;
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index 5915387..a54d82d 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -449,23 +449,23 @@
       Index lr = (std::min)(k + 4, dim);  // last row to update
       Map<Matrix<Scalar, Dynamic, 1> > tmp(m_workspace.data(), lr);
       // S
-      tmp = m_S.template middleCols<2>(k).topRows(lr) * essential2;
+      tmp.noalias() = m_S.template middleCols<2>(k).topRows(lr) * essential2;
       tmp += m_S.col(k + 2).head(lr);
       m_S.col(k + 2).head(lr) -= tau * tmp;
-      m_S.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint();
+      m_S.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
       // T
       tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2;
       tmp += m_T.col(k + 2).head(lr);
       m_T.col(k + 2).head(lr) -= tau * tmp;
-      m_T.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint();
+      m_T.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
     }
     if (m_computeQZ) {
       // Z
       Map<Matrix<Scalar, 1, Dynamic> > tmp(m_workspace.data(), dim);
-      tmp = essential2.adjoint() * (m_Z.template middleRows<2>(k));
+      tmp.noalias() = essential2.adjoint() * (m_Z.template middleRows<2>(k));
       tmp += m_Z.row(k + 2);
       m_Z.row(k + 2) -= tau * tmp;
-      m_Z.template middleRows<2>(k) -= essential2 * (tau * tmp);
+      m_Z.template middleRows<2>(k).noalias() -= essential2 * (tau * tmp);
     }
     m_T.coeffRef(k + 2, k) = m_T.coeffRef(k + 2, k + 1) = Scalar(0.0);
 
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index b2f07bc..4da6d07 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -515,8 +515,8 @@
     result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
   }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
  protected:
   typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 64c1b65..795af0d 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -69,10 +69,10 @@
 
   EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix) : m_matrix(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     return m_matrix.rows() + (int(Direction) == Vertical ? 1 : 0);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept {
     return m_matrix.cols() + (int(Direction) == Horizontal ? 1 : 0);
   }
 
@@ -244,8 +244,8 @@
   EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
       : m_lhs(take_matrix_for_product<Lhs>::run(lhs)), m_rhs(rhs) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   template <typename Dest>
   EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
@@ -275,8 +275,8 @@
   typedef remove_all_t<typename Rhs::Nested> RhsNested;
   EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   template <typename Dest>
   EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 1d8ded9..147e6e3 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -57,22 +57,22 @@
   typedef AngleAxis<Scalar> AngleAxisType;
 
   /** \returns the \c x coefficient */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
   /** \returns the \c y coefficient */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
   /** \returns the \c z coefficient */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
   /** \returns the \c w coefficient */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
 
   /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
   /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
   /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
   /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
   EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients, 3> vec() const { return coeffs().template head<3>(); }
@@ -346,13 +346,11 @@
 
   // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
   /** Default move constructor */
-  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other)
-      EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) noexcept(std::is_nothrow_move_constructible<Scalar>::value)
       : m_coeffs(std::move(other.coeffs())) {}
 
   /** Default move assignment operator */
-  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other)
-      EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
     m_coeffs = std::move(other.coeffs());
     return *this;
   }
@@ -793,7 +791,7 @@
   } else {
     // theta is the angle between the 2 quaternions
     Scalar theta = acos(absD);
-    Scalar sinTheta = sin(theta);
+    Scalar sinTheta = numext::sqrt(Scalar(1) - absD * absD);
 
     scale0 = sin((Scalar(1) - t) * theta) / sinTheta;
     scale1 = sin((t * theta)) / sinTheta;
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index b1a9f21..a5d7b60 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -353,10 +353,10 @@
   inline QTransform toQTransform(void) const;
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     return int(Mode) == int(Projective) ? m_matrix.cols() : (m_matrix.cols() - 1);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
    * \sa MatrixBase::operator(Index,Index) const */
@@ -1059,11 +1059,11 @@
                  : Scalar(1);  // so x has absolute value 1
   VectorType sv(svd.singularValues());
   sv.coeffRef(Dim - 1) *= x;
-  if (scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
+  if (scaling) (*scaling).noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
   if (rotation) {
     LinearMatrixType m(svd.matrixU());
     m.col(Dim - 1) *= x;
-    *rotation = m * svd.matrixV().adjoint();
+    (*rotation).noalias() = m * svd.matrixV().adjoint();
   }
 }
 
@@ -1182,7 +1182,8 @@
       eigen_assert(false && "Invalid transform traits in Transform::Inverse");
     }
     // translation and remaining parts
-    res.matrix().template topRightCorner<Dim, 1>() = -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
+    res.matrix().template topRightCorner<Dim, 1>().noalias() =
+        -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
     res.makeAffine();  // we do need this, because in the beginning res is uninitialized
   }
   return res;
@@ -1432,7 +1433,7 @@
   typedef Transform<Scalar, Dim, ResultMode, LhsOptions> ResultType;
   static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
     ResultType res;
-    res.linear() = lhs.linear() * rhs.linear();
+    res.linear().noalias() = lhs.linear() * rhs.linear();
     res.translation() = lhs.linear() * rhs.translation() + lhs.translation();
     res.makeAffine();
     return res;
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 682c4c7..d942ac8 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -69,18 +69,18 @@
   EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
   /** \brief Returns the x-translation by value. **/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar x() const { return m_coeffs.x(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar x() const { return m_coeffs.x(); }
   /** \brief Returns the y-translation by value. **/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar y() const { return m_coeffs.y(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar y() const { return m_coeffs.y(); }
   /** \brief Returns the z-translation by value. **/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar z() const { return m_coeffs.z(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar z() const { return m_coeffs.z(); }
 
   /** \brief Returns the x-translation as a reference. **/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& x() { return m_coeffs.x(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar& x() { return m_coeffs.x(); }
   /** \brief Returns the y-translation as a reference. **/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& y() { return m_coeffs.y(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar& y() { return m_coeffs.y(); }
   /** \brief Returns the z-translation as a reference. **/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& z() { return m_coeffs.z(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar& z() { return m_coeffs.z(); }
 
   EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
   EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 440573f..d49c961 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -183,7 +183,7 @@
    * \returns Number of rows
    * \details This equals the dimension of the space that the transformation acts on.
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     return Side == OnTheLeft ? m_vectors.rows() : m_vectors.cols();
   }
 
@@ -191,7 +191,7 @@
    * \returns Number of columns
    * \details This equals the dimension of the space that the transformation acts on.
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return rows(); }
 
   /** \brief Essential part of a Householder vector.
    * \param[in]  k  Index of Householder reflection
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 0beef60..904d853 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -51,8 +51,8 @@
     compute(mat);
   }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+  constexpr Index rows() const noexcept { return m_invdiag.size(); }
+  constexpr Index cols() const noexcept { return m_invdiag.size(); }
 
   template <typename MatType>
   DiagonalPreconditioner& analyzePattern(const MatType&) {
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index ba379ec..dd40058 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -84,10 +84,10 @@
   }
 
   /** \returns number of rows of the factored matrix */
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); }
+  constexpr Index rows() const noexcept { return m_L.rows(); }
 
   /** \returns number of columns of the factored matrix */
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); }
+  constexpr Index cols() const noexcept { return m_L.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 930077d..11ce5e5 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -135,9 +135,9 @@
   /** \brief Extraction Method for U-Factor */
   const FactorType matrixU() const;
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+  constexpr Index rows() const noexcept { return m_lu.rows(); }
 
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+  constexpr Index cols() const noexcept { return m_lu.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
@@ -446,4 +446,4 @@
 
 }  // end namespace Eigen
 
-#endif  // EIGEN_INCOMPLETE_LUT_H
\ No newline at end of file
+#endif  // EIGEN_INCOMPLETE_LUT_H
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index cf85f2e..5caa396 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -218,10 +218,10 @@
   }
 
   /** \internal */
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); }
+  constexpr Index rows() const noexcept { return matrix().rows(); }
 
   /** \internal */
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); }
+  constexpr Index cols() const noexcept { return matrix().cols(); }
 
   /** \returns the tolerance threshold used by the stopping criteria.
    * \sa setTolerance()
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 2b146b3..271679f 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -50,8 +50,8 @@
   SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
       : m_dec(dec), m_rhs(rhs), m_guess(guess) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
index 574021d..12cf6c2 100644
--- a/Eigen/src/KLUSupport/KLUSupport.h
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -100,8 +100,8 @@
     if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
   }
 
-  EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); }
-  EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); }
+  constexpr Index rows() const noexcept { return mp_matrix.rows(); }
+  constexpr Index cols() const noexcept { return mp_matrix.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 3e57764..a725a7b 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -391,8 +391,8 @@
 
   MatrixType reconstructedMatrix() const;
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lu.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_lu.cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
@@ -717,7 +717,7 @@
 
   // Step 2
   m_lu.topLeftCorner(smalldim, smalldim).template triangularView<UnitLower>().solveInPlace(c.topRows(smalldim));
-  if (rows > cols) c.bottomRows(rows - cols) -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
+  if (rows > cols) c.bottomRows(rows - cols).noalias() -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
 
   // Step 3
   m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 1edd6b8..f09b90e 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -210,8 +210,8 @@
 
   MatrixType reconstructedMatrix() const;
 
-  EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
-  EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+  constexpr Index rows() const noexcept { return m_lu.rows(); }
+  constexpr Index cols() const noexcept { return m_lu.cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index d1ad63d..dcb4dba 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -379,7 +379,7 @@
   Index l_rank = rank();
   tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs;
   tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
-  dst = m_matrixV.leftCols(l_rank) * tmp;
+  dst.noalias() = m_matrixV.leftCols(l_rank) * tmp;
 }
 
 template <typename Derived>
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index d78b30b..6df6318 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -172,7 +172,7 @@
     // 1 - update the k-th column of A
     SubColumnType v_k = A.col(k).tail(remainingRows);
     v_k -= V_k1 * Y.row(k).head(k).adjoint();
-    if (k) v_k -= X_k1 * A.col(k).head(k);
+    if (k) v_k.noalias() -= X_k1 * A.col(k).head(k);
 
     // 2 - construct left Householder transform in-place
     v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
@@ -203,7 +203,7 @@
       SubRowType u_k(A.row(k).tail(remainingCols));
       u_k = u_k.conjugate();
       {
-        u_k -= Y_k * A.row(k).head(k + 1).adjoint();
+        u_k.noalias() -= Y_k * A.row(k).head(k + 1).adjoint();
         if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
       }
 
diff --git a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
index a45be20..c603a38 100644
--- a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
+++ b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
@@ -308,19 +308,24 @@
       this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
     }
   };
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+  using head_loop =
+      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;
+
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
     const Index size = kernel.size();
     const Index alignedStart =
         DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
     const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
 
-    unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);
+    head_loop::run(kernel, 0, alignedStart);
 
     constexpr float cost = static_cast<float>(XprEvaluationCost);
     AssignmentFunctor functor(kernel);
     device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
 
-    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+    tail_loop::run(kernel, alignedEnd, size);
   }
 };
 
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h
index 5a2f38f..ff98639 100644
--- a/Eigen/src/misc/lapacke_helpers.h
+++ b/Eigen/src/misc/lapacke_helpers.h
@@ -75,7 +75,7 @@
 
 /// translates storage order of the given Eigen object to the corresponding lapack constant
 template <typename Derived>
-EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR lapack_int lapack_storage_of(const EigenBase<Derived> &) {
+EIGEN_ALWAYS_INLINE constexpr lapack_int lapack_storage_of(const EigenBase<Derived> &) {
   return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
 }
 
diff --git a/Eigen/src/plugins/BlockMethods.inc b/Eigen/src/plugins/BlockMethods.inc
index 46dc9dd..0782aa3 100644
--- a/Eigen/src/plugins/BlockMethods.inc
+++ b/Eigen/src/plugins/BlockMethods.inc
@@ -1365,6 +1365,6 @@
  * \sa subVector(Index)
  */
 template <DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index subVectors() const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index subVectors() const {
   return (Direction == Vertical) ? cols() : rows();
 }
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
index 2a1e7ab..d72d88a 100644
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -1,19 +1,26 @@
 include(EigenTesting)
 include(CheckCXXSourceCompiles)
 
-# configure the "site" and "buildname" 
+
+# configure the "site" and "buildname"
 ei_set_sitename()
 
 # retrieve and store the build string
 ei_set_build_string()
 
 add_custom_target(buildtests)
+
+if (NOT EIGEN_CTEST_ARGS)
+  # By default, run tests in parallel on all available cores.
+  set(EIGEN_CTEST_ARGS "-j0" CACHE STRING "Arguments passed to ctest by the check target.")
+endif()
 add_custom_target(check COMMAND "ctest" ${EIGEN_CTEST_ARGS})
+
 add_dependencies(check buildtests)
 
 # Convenience target for only building GPU tests.
 add_custom_target(buildtests_gpu)
-add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" 
+add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
                                             "--no-compress-output"
                                             "--build-no-clean"
                                             "-T" "test"
@@ -59,7 +66,7 @@
     set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
   endif()
-  
+
 elseif(MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 0d123a2..6178d4b 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -115,3 +115,8 @@
   WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc)
 
 add_dependencies(doc doc-eigen-prerequisites doc-unsupported-prerequisites)
+
+add_custom_target(install-doc ALL
+  COMMAND ${CMAKE_COMMAND} -E copy_directory ${Eigen_BINARY_DIR}/doc ${CMAKE_INSTALL_FULL_DOCDIR}
+)
+add_dependencies(install-doc doc)
diff --git a/scripts/msvc_setup.ps1 b/scripts/msvc_setup.ps1
new file mode 100644
index 0000000..e2d0642
--- /dev/null
+++ b/scripts/msvc_setup.ps1
@@ -0,0 +1,21 @@
+# Powershell script to set up MSVC environment.
+
+param ($EIGEN_CI_MSVC_ARCH, $EIGEN_CI_MSVC_VER)
+
+Set-PSDebug -Trace 1
+
+function Get-ScriptDirectory { Split-Path $MyInvocation.ScriptName }
+       
+# Set defaults if not already set.
+IF (!$EIGEN_CI_MSVC_ARCH)             { $EIGEN_CI_MSVC_ARCH             = "x64" }
+IF (!$EIGEN_CI_MSVC_VER)              { $EIGEN_CI_MSVC_VER              = "14.29" }
+
+# Export variables into the global scope
+$global:EIGEN_CI_MSVC_ARCH             = $EIGEN_CI_MSVC_ARCH
+$global:EIGEN_CI_MSVC_VER              = $EIGEN_CI_MSVC_VER
+
+# Find Visual Studio installation directory.
+$global:VS_INSTALL_DIR = &"${Env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
+# Run VCVarsAll.bat initialization script and extract environment variables.
+# http://allen-mack.blogspot.com/2008/03/replace-visual-studio-command-prompt.html
+cmd.exe /c "`"${VS_INSTALL_DIR}\VC\Auxiliary\Build\vcvarsall.bat`" $EIGEN_CI_MSVC_ARCH -vcvars_ver=$EIGEN_CI_MSVC_VER & set" | foreach { if ($_ -match "=") { $v = $_.split("="); set-item -force -path "ENV:\$($v[0])" -value "$($v[1])" } }
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 813cc53..e62ec45 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -186,6 +186,7 @@
 ei_add_test(float_conversion)
 ei_add_test(io)
 ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
+ei_add_test(packet_segment)
 ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
 ei_add_test(constexpr)
diff --git a/test/packet_ostream.h b/test/packet_ostream.h
deleted file mode 100644
index 49e1bb0..0000000
--- a/test/packet_ostream.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef TEST_PACKET_OSTREAM
-#define TEST_PACKET_OSTREAM
-
-#include <type_traits>
-#include <ostream>
-
-// Include this header to be able to print Packets while debugging.
-
-template <typename Packet,
-          typename EnableIf = std::enable_if_t<Eigen::internal::unpacket_traits<Packet>::vectorizable> >
-std::ostream& operator<<(std::ostream& os, const Packet& packet) {
-  using Scalar = typename Eigen::internal::unpacket_traits<Packet>::type;
-  Scalar v[Eigen::internal::unpacket_traits<Packet>::size];
-  Eigen::internal::pstoreu(v, packet);
-  os << "{" << v[0];
-  for (int i = 1; i < Eigen::internal::unpacket_traits<Packet>::size; ++i) {
-    os << "," << v[i];
-  }
-  os << "}";
-  return os;
-}
-
-#endif  // TEST_PACKET_OSTREAM
\ No newline at end of file
diff --git a/test/packet_segment.cpp b/test/packet_segment.cpp
new file mode 100644
index 0000000..6fa6a29
--- /dev/null
+++ b/test/packet_segment.cpp
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template <typename Scalar, typename Packet>
+void verify_data(const Scalar* data_in, const Scalar* data_out, const Packet& a, Index begin, Index count) {
+  constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
+  bool ok = true;
+  for (Index i = begin; i < begin + count; i++) {
+    ok = ok && numext::equal_strict(data_in[i], data_out[i]);
+  }
+  if (!ok) {
+    std::cout << "begin: " << begin << ", count: " << count << "\n";
+    std::cout << "Scalar type: " << type_name(Scalar()) << " x " << PacketSize << "\n";
+    std::cout << "data in:  {";
+    for (Index i = 0; i < PacketSize; i++) {
+      if (i > 0) std::cout << ",";
+      if (i < begin || i >= begin + count) {
+        std::cout << "MASK";
+      } else {
+        std::cout << data_in[i];
+      }
+    }
+    std::cout << "}\n";
+    std::cout << "data out: {";
+    for (Index i = 0; i < PacketSize; i++) {
+      if (i > 0) std::cout << ",";
+      if (i < begin || i >= begin + count) {
+        std::cout << "MASK";
+      } else {
+        std::cout << data_out[i];
+      }
+    }
+    std::cout << "}\n";
+    std::cout << "packet:   ";
+    std::cout << internal::postream(a) << "\n";
+  }
+  VERIFY(ok);
+}
+
+template <typename Scalar, int PacketSize, bool Run = internal::find_packet_by_size<Scalar, PacketSize>::value>
+struct packet_segment_test_impl {
+  using Packet = typename internal::find_packet_by_size<Scalar, PacketSize>::type;
+  static void test_unaligned() {
+    // test loading a packet segment from unaligned memory that includes unallocated memory
+
+    // | X   X   X   X | *   *   *   X | X   X   X   X |
+    //    begin -> { X | *   *   *   } <- begin + count
+
+    VectorX<Scalar> data_in(PacketSize), data_out(PacketSize);
+    data_in.setRandom();
+    data_out.setRandom();
+
+    Scalar* unaligned_data_in = data_in.data() - 1;
+    Scalar* unaligned_data_out = data_out.data() - 1;
+
+    Index begin = 1;
+    Index count = PacketSize - 1;
+
+    Packet a = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
+    internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, a, begin, count);
+
+    verify_data(unaligned_data_in, unaligned_data_out, a, begin, count);
+
+    // test loading the entire packet
+
+    data_in.setRandom();
+    data_out.setRandom();
+
+    unaligned_data_in = data_in.data();
+    unaligned_data_out = data_out.data();
+
+    begin = 0;
+    count = PacketSize;
+
+    Packet b = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
+    internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, b, begin, count);
+
+    verify_data(unaligned_data_in, unaligned_data_out, b, begin, count);
+
+    // test loading an empty packet segment in unallocated memory
+    count = 0;
+
+    for (begin = 0; begin < PacketSize; begin++) {
+      data_in.setRandom();
+      data_out = data_in;
+      Packet c = internal::ploaduSegment<Packet>(data_in.data(), begin, count);
+      internal::pstoreuSegment<Scalar, Packet>(data_out.data(), c, begin, count);
+      // verify that ploaduSegment / pstoreuSegment did nothing
+      VERIFY_IS_CWISE_EQUAL(data_in, data_out);
+    }
+  }
+  static void test_aligned() {
+    // test loading a packet segment from aligned memory that includes unallocated memory
+
+    // | X   X   X   X | *   *   *   X | X   X   X   X |
+    //        begin -> { *   *   *   X } <- begin + count
+
+    VectorX<Scalar> data_in(PacketSize - 1), data_out(PacketSize - 1);
+    data_in.setRandom();
+    data_out.setRandom();
+
+    Scalar* aligned_data_in = data_in.data();
+    Scalar* aligned_data_out = data_out.data();
+
+    Index begin = 0;
+    Index count = PacketSize - 1;
+
+    Packet b = internal::ploadSegment<Packet>(aligned_data_in, begin, count);
+    internal::pstoreSegment<Scalar, Packet>(aligned_data_out, b, begin, count);
+
+    verify_data(aligned_data_in, aligned_data_out, b, begin, count);
+  }
+  static void run() {
+    test_unaligned();
+    test_aligned();
+  }
+};
+
+template <typename Scalar, int PacketSize>
+struct packet_segment_test_impl<Scalar, PacketSize, false> {
+  static void run() {}
+};
+
+template <typename Scalar, int PacketSize>
+struct packet_segment_test_driver {
+  static void run() {
+    packet_segment_test_impl<Scalar, PacketSize>::run();
+    packet_segment_test_driver<Scalar, PacketSize / 2>::run();
+  }
+};
+
+template <typename Scalar>
+struct packet_segment_test_driver<Scalar, 1> {
+  static void run() {}
+};
+
+template <typename Scalar>
+void test_packet_segment() {
+  packet_segment_test_driver<Scalar, internal::packet_traits<Scalar>::size>::run();
+}
+
+EIGEN_DECLARE_TEST(packet_segment) {
+  for (int i = 0; i < g_repeat; i++) {
+    test_packet_segment<bool>();
+    test_packet_segment<int8_t>();
+    test_packet_segment<uint8_t>();
+    test_packet_segment<int16_t>();
+    test_packet_segment<uint16_t>();
+    test_packet_segment<int32_t>();
+    test_packet_segment<uint32_t>();
+    test_packet_segment<int64_t>();
+    test_packet_segment<uint64_t>();
+    test_packet_segment<bfloat16>();
+    test_packet_segment<half>();
+    test_packet_segment<float>();
+    test_packet_segment<double>();
+    test_packet_segment<std::complex<float>>();
+    test_packet_segment<std::complex<double>>();
+  }
+}
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 9c5d6cf..7647592 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -10,7 +10,6 @@
 
 #include "packetmath_test_shared.h"
 #include "random_without_cast_overflow.h"
-#include "packet_ostream.h"
 
 template <typename T>
 inline T REF_ADD(const T& a, const T& b) {
@@ -24,21 +23,55 @@
 inline T REF_MUL(const T& a, const T& b) {
   return a * b;
 }
+
+template <typename Scalar, typename EnableIf = void>
+struct madd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return a * b + c;
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return a * b - c;
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return c - a * b;
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return Scalar(0) - (a * b + c);
+  }
+};
+
+template <typename Scalar>
+struct madd_impl<Scalar,
+                 std::enable_if_t<Eigen::internal::is_scalar<Scalar>::value && Eigen::NumTraits<Scalar>::IsSigned>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::fma(a, b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::fma(a, b, Scalar(-c));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::fma(Scalar(-a), b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return -Scalar(numext::fma(a, b, c));
+  }
+};
+
 template <typename T>
 inline T REF_MADD(const T& a, const T& b, const T& c) {
-  return a * b + c;
+  return madd_impl<T>::madd(a, b, c);
 }
 template <typename T>
 inline T REF_MSUB(const T& a, const T& b, const T& c) {
-  return a * b - c;
+  return madd_impl<T>::msub(a, b, c);
 }
 template <typename T>
 inline T REF_NMADD(const T& a, const T& b, const T& c) {
-  return c - a * b;
+  return madd_impl<T>::nmadd(a, b, c);
 }
 template <typename T>
 inline T REF_NMSUB(const T& a, const T& b, const T& c) {
-  return test::negate(a * b + c);
+  return madd_impl<T>::nmsub(a, b, c);
 }
 template <typename T>
 inline T REF_DIV(const T& a, const T& b) {
@@ -70,6 +103,14 @@
 inline bool REF_MADD(const bool& a, const bool& b, const bool& c) {
   return (a && b) || c;
 }
+template <>
+inline bool REF_DIV(const bool& a, const bool& b) {
+  return a && b;
+}
+template <>
+inline bool REF_RECIPROCAL(const bool& a) {
+  return a;
+}
 
 template <typename T>
 inline T REF_FREXP(const T& x, T& exp) {
@@ -92,6 +133,26 @@
   return static_cast<T>(ldexp(x, static_cast<int>(exp)));
 }
 
+// provides a convenient function to take the absolute value of each component of a complex number to prevent
+// catastrophic cancellation in randomly generated complex numbers
+template <typename T, bool IsComplex = NumTraits<T>::IsComplex>
+struct abs_helper_impl {
+  static T run(T x) { return numext::abs(x); }
+};
+template <typename T>
+struct abs_helper_impl<T, true> {
+  static T run(T x) {
+    T res = x;
+    numext::real_ref(res) = numext::abs(numext::real(res));
+    numext::imag_ref(res) = numext::abs(numext::imag(res));
+    return res;
+  }
+};
+template <typename T>
+T abs_helper(T x) {
+  return abs_helper_impl<T>::run(x);
+}
+
 // Uses pcast to cast from one array to another.
 template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
 struct pcast_array;
@@ -481,8 +542,8 @@
   eigen_optimization_barrier_test<Scalar>::run();
 
   for (int i = 0; i < size; ++i) {
-    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
-    data2[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    data1[i] = internal::random<Scalar>();
+    data2[i] = internal::random<Scalar>();
     refvalue = (std::max)(refvalue, numext::abs(data1[i]));
   }
 
@@ -502,8 +563,8 @@
   for (int M = 0; M < PacketSize; ++M) {
     for (int N = 0; N <= PacketSize; ++N) {
       for (int j = 0; j < size; ++j) {
-        data1[j] = internal::random<Scalar>() / RealScalar(PacketSize);
-        data2[j] = internal::random<Scalar>() / RealScalar(PacketSize);
+        data1[j] = internal::random<Scalar>();
+        data2[j] = internal::random<Scalar>();
         refvalue = (std::max)(refvalue, numext::abs(data1[j]));
       }
 
@@ -568,6 +629,7 @@
   negate_test<Scalar, Packet>(data1, data2, ref, PacketSize);
   CHECK_CWISE1_IF(PacketTraits::HasReciprocal, REF_RECIPROCAL, internal::preciprocal);
   CHECK_CWISE1(numext::conj, internal::pconj);
+
   CHECK_CWISE1_IF(PacketTraits::HasSign, numext::sign, internal::psign);
 
   for (int offset = 0; offset < 3; ++offset) {
@@ -632,11 +694,17 @@
   // Avoid overflows.
   if (NumTraits<Scalar>::IsInteger && NumTraits<Scalar>::IsSigned &&
       Eigen::internal::unpacket_traits<Packet>::size > 1) {
-    Scalar limit =
-        static_cast<Scalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())),
-                                     1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size)));
+    Scalar limit = static_cast<Scalar>(
+        static_cast<RealScalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())),
+                                         1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size))));
     for (int i = 0; i < PacketSize; ++i) {
-      data1[i] = internal::random<Scalar>(-limit, limit);
+      data1[i] = internal::random<Scalar>(Scalar(0) - limit, limit);
+    }
+  } else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex) {
+    // Prevent very small product results by adjusting range.  Otherwise,
+    // we may end up with multiplying e.g. 32 Eigen::halfs with values < 1.
+    for (int i = 0; i < PacketSize; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.5), Scalar(1)) * (internal::random<bool>() ? Scalar(-1) : Scalar(1));
     }
   }
   ref[0] = Scalar(1);
@@ -724,11 +792,6 @@
   packetmath_pcast_ops_runner<Scalar, Packet>::run();
   packetmath_minus_zero_add_test<Scalar, Packet>::run();
 
-  for (int i = 0; i < size; ++i) {
-    data1[i] = numext::abs(internal::random<Scalar>());
-  }
-  CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
-  CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
   CHECK_CWISE3_IF(true, REF_MADD, internal::pmadd);
   if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) {
     nmsub_test<Scalar, Packet>(data1, data2, ref, PacketSize);
@@ -738,14 +801,18 @@
   // which can lead to very flaky tests. Here we ensure the signs are such that
   // they do not cancel.
   for (int i = 0; i < PacketSize; ++i) {
-    data1[i] = numext::abs(internal::random<Scalar>());
-    data1[i + PacketSize] = numext::abs(internal::random<Scalar>());
-    data1[i + 2 * PacketSize] = Scalar(0) - numext::abs(internal::random<Scalar>());
+    data1[i] = abs_helper(internal::random<Scalar>());
+    data1[i + PacketSize] = abs_helper(internal::random<Scalar>());
+    data1[i + 2 * PacketSize] = Scalar(0) - abs_helper(internal::random<Scalar>());
   }
   if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) {
     CHECK_CWISE3_IF(true, REF_MSUB, internal::pmsub);
     CHECK_CWISE3_IF(true, REF_NMADD, internal::pnmadd);
   }
+
+  CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
+  CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
+  CHECK_CWISE1_IF(PacketTraits::HasCbrt, numext::cbrt, internal::pcbrt);
 }
 
 // Notice that this definition works for complex types as well.
@@ -767,6 +834,7 @@
 
 CREATE_FUNCTOR(psqrt_functor, internal::psqrt);
 CREATE_FUNCTOR(prsqrt_functor, internal::prsqrt);
+CREATE_FUNCTOR(pcbrt_functor, internal::pcbrt);
 
 // TODO(rmlarsen): Run this test for more functions.
 template <bool Cond, typename Scalar, typename Packet, typename RefFunctorT, typename FunctorT>
@@ -1137,6 +1205,7 @@
 
     packetmath_test_IEEE_corner_cases<PacketTraits::HasSqrt, Scalar, Packet>(numext::sqrt<Scalar>, psqrt_functor());
     packetmath_test_IEEE_corner_cases<PacketTraits::HasRsqrt, Scalar, Packet>(numext::rsqrt<Scalar>, prsqrt_functor());
+    packetmath_test_IEEE_corner_cases<PacketTraits::HasCbrt, Scalar, Packet>(numext::cbrt<Scalar>, pcbrt_functor());
 
     // TODO(rmlarsen): Re-enable for half and bfloat16.
     if (PacketTraits::HasCos && !internal::is_same<Scalar, half>::value &&
@@ -1665,7 +1734,7 @@
 
   for (Index N = 0; N <= PacketSize; ++N) {
     for (Index i = 0; i < N; ++i) {
-      data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+      data1[i] = internal::random<Scalar>();
     }
 
     for (Index i = 0; i < N * 20; ++i) {
@@ -1684,7 +1753,7 @@
     }
 
     for (Index i = 0; i < N * 7; ++i) {
-      buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+      buffer[i] = internal::random<Scalar>();
     }
     packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
     internal::pstore_partial(data1, packet, N);
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
index 93d4149..7d7a0da 100644
--- a/test/packetmath_test_shared.h
+++ b/test/packetmath_test_shared.h
@@ -162,7 +162,9 @@
 
   template <typename T>
   inline Packet load(const T* from, unsigned long long umask) const {
-    return internal::ploadu<Packet>(from, umask);
+    using UMaskType = typename numext::get_integer_by_size<internal::plain_enum_max(
+        internal::unpacket_traits<Packet>::size / CHAR_BIT, 1)>::unsigned_type;
+    return internal::ploadu<Packet>(from, static_cast<UMaskType>(umask));
   }
 
   template <typename T>
@@ -172,7 +174,9 @@
 
   template <typename T>
   inline void store(T* to, const Packet& x, unsigned long long umask) const {
-    internal::pstoreu(to, x, umask);
+    using UMaskType = typename numext::get_integer_by_size<internal::plain_enum_max(
+        internal::unpacket_traits<Packet>::size / CHAR_BIT, 1)>::unsigned_type;
+    internal::pstoreu(to, x, static_cast<UMaskType>(umask));
   }
 
   template <typename T>
diff --git a/test/product.h b/test/product.h
index f8eb5df..f37a932 100644
--- a/test/product.h
+++ b/test/product.h
@@ -38,6 +38,15 @@
 std::enable_if_t<RhsType::SizeAtCompileTime != Dynamic, void> check_mismatched_product(LhsType& /*unused*/,
                                                                                        const RhsType& /*unused*/) {}
 
+template <typename Scalar, typename V1, typename V2>
+Scalar ref_dot_product(const V1& v1, const V2& v2) {
+  Scalar out = Scalar(0);
+  for (Index i = 0; i < v1.size(); ++i) {
+    out = Eigen::numext::fma(v1[i], v2[i], out);
+  }
+  return out;
+}
+
 template <typename MatrixType>
 void product(const MatrixType& m) {
   /* this test covers the following files:
@@ -245,7 +254,10 @@
   // inner product
   {
     Scalar x = square2.row(c) * square2.col(c2);
-    VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
+    // NOTE: FMA is necessary here in the reference to ensure accuracy for
+    // large vector sizes and float16/bfloat16 types.
+    Scalar y = ref_dot_product<Scalar>(square2.row(c), square2.col(c2));
+    VERIFY_IS_APPROX(x, y);
   }
 
   // outer product
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index 902aa96..ea210a1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -45,19 +45,19 @@
 template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
 struct TVPanelSize {
   // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  static constexpr StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
   // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  static constexpr StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
   // TileSizeDimNC: determines the tile size for the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  static constexpr StorageIndex TileSizeDimNC = NCWindow / NCFactor;
   // TileSizeDimC: determines the tile size for the contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  static constexpr StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
   // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  static constexpr StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
   // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  static constexpr StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
   // BC : determines if supporting bank conflict is required
-  static EIGEN_CONSTEXPR bool BC = false;
+  static constexpr bool BC = false;
 };
 #endif
 
@@ -81,40 +81,40 @@
 template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
 struct TTPanelSize {
   // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+  static constexpr StorageIndex TileSizeDimK = TSDK;
   // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
   // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
 #ifndef EIGEN_SYCL_REG_M
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+  static constexpr StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
 #else
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+  static constexpr StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
 #endif
 // WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
 // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
 #ifndef EIGEN_SYCL_REG_N
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+  static constexpr StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
 #else
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+  static constexpr StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
 #endif
   // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  static constexpr StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
   // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  static constexpr StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
   // TileSizeDimM: determines the tile size for the m dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+  static constexpr StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
   // TileSizeDimN: determines the tile size for the n dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+  static constexpr StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
   // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize
-  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+  static constexpr StorageIndex LoadPerThreadLhs =
       ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
   // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize
-  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+  static constexpr StorageIndex LoadPerThreadRhs =
       ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
   // BC : determines if supporting bank conflict is required
-  static EIGEN_CONSTEXPR bool BC = true;
+  static constexpr bool BC = true;
   // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
   // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
-  static EIGEN_CONSTEXPR bool DoubleBuffer =
+  static constexpr bool DoubleBuffer =
 #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
       false;
 #else
@@ -220,7 +220,7 @@
 template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<dt != data_source::global_mem, void> write(
     PacketType &packet_data, DataScalar ptr) {
-  EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+  constexpr int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
   EIGEN_UNROLL_LOOP
   for (int i = 0; i < PacketSize; i++) {
     *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
@@ -320,14 +320,14 @@
  */
 template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
 struct BlockProperties {
-  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+  static constexpr bool packet_load = packet_load_;
   typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
-  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+  static constexpr bool is_rhs = is_rhs_;
   typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType;
-  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
-  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
-  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
-  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+  static constexpr int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+  static constexpr bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static constexpr int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static constexpr int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
 };
 
 /*!
@@ -458,11 +458,11 @@
  public:
   typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
       PacketReturnType;
-  static EIGEN_CONSTEXPR int PacketSize =
+  static constexpr int PacketSize =
       Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
-  static EIGEN_CONSTEXPR bool is_lhs_transposed =
+  static constexpr bool is_lhs_transposed =
       !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
-  static EIGEN_CONSTEXPR bool is_rhs_transposed =
+  static constexpr bool is_rhs_transposed =
       !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
 
   typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
@@ -473,20 +473,20 @@
                           PacketReturnType>
       RHSBlockProperties;
 
-  static EIGEN_CONSTEXPR StorageIndex NStride =
+  static constexpr StorageIndex NStride =
       contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
 
   typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
   typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
   typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
   typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr> tile_ptr;
-  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
-                                                 ? Properties::TileSizeDimM + Properties::BC
-                                                 : Properties::WorkLoadPerThreadM;
-  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
-                                                 ? Properties::TileSizeDimN + Properties::BC
-                                                 : Properties::WorkLoadPerThreadN;
-  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+  static constexpr StorageIndex LSDL = contraction_tp == contraction_type::local
+                                           ? Properties::TileSizeDimM + Properties::BC
+                                           : Properties::WorkLoadPerThreadM;
+  static constexpr StorageIndex LSDR = contraction_tp == contraction_type::local
+                                           ? Properties::TileSizeDimN + Properties::BC
+                                           : Properties::WorkLoadPerThreadN;
+  static constexpr StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
 
   /**
    * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
@@ -638,7 +638,7 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
                                                                     PacketReturnType *privateRes) const {
     StorageIndex idx = 0;
-    EIGEN_CONSTEXPR StorageIndex lhs_stride =
+    constexpr StorageIndex lhs_stride =
         contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
     EIGEN_UNROLL_LOOP
     for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
@@ -668,8 +668,7 @@
     // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
     // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
     // WorkLoadPerThreadN slice of N
-    EIGEN_CONSTEXPR StorageIndex GlobalNStride =
-        contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+    constexpr StorageIndex GlobalNStride = contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
     EIGEN_UNROLL_LOOP
     for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
       // output leading dimension
@@ -713,9 +712,9 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::no_local> extract_block(
       const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
       const StorageIndex &ncOffset, const StorageIndex cOffset) const {
-    EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+    constexpr StorageIndex LocalThreadSizeNC =
         InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
-    EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+    constexpr StorageIndex WorkLoadPerThreadNC =
         InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
     const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
 
@@ -891,11 +890,11 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::local> extract_block(
       const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex> &local_index,
       const StorageIndex &ncOffset, const StorageIndex cOffset) const {
-    EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+    constexpr StorageIndex TileSizeDimNC =
         InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
-    EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+    constexpr StorageIndex LoadPerThread =
         InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
-    EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+    constexpr StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
     static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
                    (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
                   " LocalOffset must be divisible by stride");
@@ -995,11 +994,11 @@
 struct GeneralVectorTensor {
   typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
       PacketReturnType;
-  static EIGEN_CONSTEXPR int PacketSize =
+  static constexpr int PacketSize =
       Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
   typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
 
-  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+  static constexpr StorageIndex OutScratchOffset =
       KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
 
   // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
@@ -1328,8 +1327,8 @@
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
   struct input_mapper_propertis {
-    static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
-    static EIGEN_CONSTEXPR bool is_rhs_matrix =
+    static constexpr bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+    static constexpr bool is_rhs_matrix =
         (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
   };
 
@@ -1537,9 +1536,9 @@
   void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
                                     StorageIndex NC, StorageIndex C) const {
     const StorageIndex nonContractDim = NC;
-    EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
-    EIGEN_CONSTEXPR StorageIndex CFactor = 1;
-    EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+    constexpr StorageIndex NCFactor = 1;
+    constexpr StorageIndex CFactor = 1;
+    constexpr StorageIndex NCWindow = 16;
     typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
         Properties;
     const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
@@ -1601,7 +1600,7 @@
                           (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                         "The Local thread size must be a power of 2 for the reduction "
                         "operation");
-    EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+    constexpr StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
 
     // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
     // reduces at least 512 elementss individually, we get better performance.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 6a7571c..394c150 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -42,10 +42,10 @@
   typedef Index Real;
   enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 };
 
-  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
-  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
-  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real lowest() { return n; }
 };
 
 namespace internal {
@@ -569,47 +569,47 @@
 namespace Eigen {
 namespace internal {
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
+static EIGEN_DEVICE_FUNC constexpr bool index_known_statically(Index i) {
   return index_known_statically_impl<T>::run(i);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
+static EIGEN_DEVICE_FUNC constexpr bool all_indices_known_statically() {
   return all_indices_known_statically_impl<T>::run();
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
+static EIGEN_DEVICE_FUNC constexpr bool indices_statically_known_to_increase() {
   return indices_statically_known_to_increase_impl<T>::run();
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_eq(Index i, Index value) {
   return index_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_ne(Index i, Index value) {
   return index_statically_ne_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_gt(Index i, Index value) {
   return index_statically_gt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_lt(Index i, Index value) {
   return index_statically_lt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_pair_first_statically_eq(Index i, Index value) {
   return index_pair_first_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_pair_second_statically_eq(Index i, Index value) {
   return index_pair_second_statically_eq_impl<T>::run(i, value);
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 8c2bb2e..8454070 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -212,9 +212,9 @@
   typedef U first_type;
   typedef V second_type;
 
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {}
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {}
 
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {}
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) {
     using numext::swap;
@@ -224,20 +224,20 @@
 };
 
 template <typename U, typename V>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) {
   return (x.first == y.first && x.second == y.second);
 }
 
 template <typename U, typename V>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) {
   return !(x == y);
 }
 
 // Can't use std::pairs on cuda devices
 template <typename Idx>
 struct IndexPair {
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+  constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+  constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
 
   EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
     first = val.first;
@@ -251,19 +251,18 @@
 namespace internal {
 
 template <typename IndexType, typename Index, Index First, Index... Is>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array(
     IndexType& idx, numeric_list<Index, First, Is...>) {
   return {static_cast<Index>(idx[First]), static_cast<Index>(idx[Is])...};
 }
 template <typename IndexType, typename Index>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&,
-                                                                                          numeric_list<Index>) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
   return array<Index, 0>();
 }
 
 /** Make an array (for index/dimensions) out of a custom index */
 template <typename Index, std::size_t NumIndices, typename IndexType>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) {
   return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 6944c03..b4749b4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -402,9 +402,9 @@
 
 template <typename Index, Index LTP, Index LTR, bool BC_>
 struct ReductionPannel {
-  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
-  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
-  static EIGEN_CONSTEXPR bool BC = BC_;
+  static constexpr Index LocalThreadSizeP = LTP;
+  static constexpr Index LocalThreadSizeR = LTR;
+  static constexpr bool BC = BC_;
 };
 
 template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
@@ -430,7 +430,7 @@
                   "The Local thread size must be a power of 2 for the reduction "
                   "operation");
 
-    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
     // In this step, we force the code not to be more than 2-step reduction:
     // Our empirical research shows that if each thread reduces at least 64
     // elements individually, we get better performance. However, this can change
@@ -445,7 +445,7 @@
     const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
     const Index globalRange = pNumGroups * rNumGroups * localRange;
 
-    EIGEN_CONSTEXPR Index scratchSize =
+    constexpr Index scratchSize =
         PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
     auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
     if (rNumGroups > 1) {
@@ -482,15 +482,15 @@
 struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
   typedef typename Self::CoeffReturnType CoeffReturnType;
   typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
-  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static constexpr bool HasOptimizedImplementation = true;
+  static constexpr int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
   static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
     typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
     static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                     (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                   "The Local thread size must be a power of 2 for the reduction "
                   "operation");
-    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+    constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
 
     typename Self::Index inputSize = self.impl().dimensions().TotalSize();
     // In this step we force the code not to be more than 2-step reduction:
@@ -535,7 +535,7 @@
 // col reduction
 template <typename Self, typename Op>
 struct OuterReducer<Self, Op, Eigen::SyclDevice> {
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static constexpr bool HasOptimizedImplementation = true;
 
   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                   typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
@@ -549,7 +549,7 @@
 // row reduction
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, Eigen::SyclDevice> {
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static constexpr bool HasOptimizedImplementation = true;
 
   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                   typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
@@ -566,7 +566,7 @@
 // generic partial reduction
 template <typename Self, typename Op>
 struct GenericReducer<Self, Op, Eigen::SyclDevice> {
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+  static constexpr bool HasOptimizedImplementation = false;
   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                   typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                   typename Self::Index num_coeffs_to_preserve) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index f0a390f..6de0867 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -197,7 +197,7 @@
 // threads. Currently set to twice the cache line size on Intel and ARM
 // processors.
 EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
-  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
+  constexpr Index kBlockAlignment = 128;
   const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size);
   return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
index 30fde91..3636788 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -51,7 +51,7 @@
 template <typename index_t>
 struct ScanParameters {
   // must be power of 2
-  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+  static constexpr index_t ScanPerThread = 8;
   const index_t total_size;
   const index_t non_scan_size;
   const index_t scan_size;
@@ -86,7 +86,7 @@
 struct ScanKernelFunctor {
   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
       LocalAccessor;
-  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
 
   LocalAccessor scratch;
   Evaluator dev_eval;
@@ -288,7 +288,7 @@
 struct ScanAdjustmentKernelFunctor {
   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
       LocalAccessor;
-  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
   InAccessor in_ptr;
   OutAccessor out_ptr;
   const ScanParameters<Index> scanParameters;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index c149985..5357a48 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -27,6 +27,10 @@
   typedef std::remove_reference_t<Nested> Nested_;
   static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
   static constexpr int Layout = XprTraits::Layout;
+  enum {
+    // Trace is read-only.
+    Flags = traits<XprType>::Flags & ~LvalueBit
+  };
 };
 
 template <typename Dims, typename XprType>
@@ -203,6 +207,8 @@
     return true;
   }
 
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return nullptr; }
+
   EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
index 51c0ad6..ae5c4f4 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -106,7 +106,7 @@
     int one;
     int two;
     int flags;
-    constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
+    constexpr Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
   };
 
   std::size_t m_numIndices;
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
index 3f9bb51..66a982b 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -25,7 +25,7 @@
   constexpr static std::size_t N = sizeof...(nn);
 
   template <typename T>
-  constexpr static inline std::array<T, N> run(const std::array<T, N>& indices) {
+  constexpr static std::array<T, N> run(const std::array<T, N>& indices) {
     return {{indices[nn]...}};
   }
 };
@@ -51,7 +51,7 @@
 template <typename iib>
 struct tensor_static_symgroup_multiply_helper {
   template <int... iia>
-  constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
+  constexpr static numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
     return numeric_list<int, get<iia, iib>::value...>();
   }
 };
@@ -107,9 +107,9 @@
 };
 
 template <typename Index, std::size_t N, int... ii, int... jj>
-constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx,
-                                                                                  internal::numeric_list<int, ii...>,
-                                                                                  internal::numeric_list<int, jj...>) {
+constexpr static std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx,
+                                                                           internal::numeric_list<int, ii...>,
+                                                                           internal::numeric_list<int, jj...>) {
   return {{idx[ii]..., idx[jj]...}};
 }
 
@@ -179,9 +179,9 @@
   typedef typename group_elements::type ge;
 
  public:
-  constexpr inline StaticSGroup() {}
-  constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
-  constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
+  constexpr StaticSGroup() {}
+  constexpr StaticSGroup(const StaticSGroup<Gen...>&) {}
+  constexpr StaticSGroup(StaticSGroup<Gen...>&&) {}
 
   template <typename Op, typename RV, typename Index, std::size_t N, typename... Args>
   static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) {
@@ -196,8 +196,8 @@
 
   constexpr static std::size_t static_size = ge::count;
 
-  constexpr static inline std::size_t size() { return ge::count; }
-  constexpr static inline int globalFlags() { return group_elements::global_flags; }
+  constexpr static std::size_t size() { return ge::count; }
+  constexpr static int globalFlags() { return group_elements::global_flags; }
 
   template <typename Tensor_, typename... IndexTypes>
   inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index e3be69d..632f437 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -36,15 +36,15 @@
  */
 
 template <std::size_t I_, class T>
-constexpr inline T& array_get(std::vector<T>& a) {
+constexpr T& array_get(std::vector<T>& a) {
   return a[I_];
 }
 template <std::size_t I_, class T>
-constexpr inline T&& array_get(std::vector<T>&& a) {
+constexpr T&& array_get(std::vector<T>&& a) {
   return a[I_];
 }
 template <std::size_t I_, class T>
-constexpr inline T const& array_get(std::vector<T> const& a) {
+constexpr T const& array_get(std::vector<T> const& a) {
   return a[I_];
 }
 
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index 808fd7d..19ec8ea 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -268,7 +268,7 @@
 
     /* compute the scaled predicted reduction and */
     /* the scaled directional derivative. */
-    wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1);
+    wa3.noalias() = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1);
     temp1 = numext::abs2(wa3.stableNorm() / fnorm);
     temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
     prered = temp1 + temp2 / Scalar(.5);
diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index 4bf7ee5..201fba3 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -75,7 +75,7 @@
       /* givens rotation. */
       w_givens[j] = givens;
     } else
-      v_givens[j] = IdentityRotation;
+      w_givens[j] = IdentityRotation;
 
     /* test for zero diagonal elements in the output s. */
     if (s(j, j) == 0.) {
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index cf09749..8d6821a 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -108,6 +108,17 @@
   VERIFY_IS_EQUAL(slice.data(), tensor.data());
 }
 
+static void test_ref_of_trace() {
+  Tensor<int, 2> input(6, 6);
+  input.setRandom();
+  int trace = 0;
+  for (int i = 0; i < 6; ++i) {
+    trace += input(i, i);
+  }
+  TensorRef<const Tensor<int, 0>> ref(input.trace());
+  VERIFY_IS_EQUAL(ref.coeff(0), trace);
+}
+
 static void test_ref_of_ref() {
   Tensor<float, 3> input(3, 5, 7);
   input.setRandom();
@@ -224,6 +235,7 @@
   CALL_SUBTEST(test_simple_rvalue_ref());
   CALL_SUBTEST(test_multiple_dims());
   CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_ref_of_trace());
   CALL_SUBTEST(test_ref_of_ref());
   CALL_SUBTEST(test_ref_in_expr());
   CALL_SUBTEST(test_coeff_ref());