Update Eigen to commit:e7c799b7c984f9b8bea27967bb04a97c52e62582

CHANGELOG
=========
e7c799b7c - Prevent premature overflow to infinity in exp(x). The changes also provide a 3-4% speedup.
00af47102 - Revert https://gitlab.com/libeigen/eigen/-/commit/040180078db70b8673932d7e5615920d64ceeaf5
8ee6f8475 - Speed up exp(x) by 30-35%.
93ec5450c - disable fill_n optimization for msvc
0af6ab4b7 - Remove unnecessary check for HasBlend trait.
d5eec781b - Get rid of redundant computation for large arguments to erf(x).
2fc63808e - Fix C++20 constexpr test compilation failures
5133c836c - Vectorize erf(x) for double.
d6e3b528b - Update Assign_MKL.h to cast disparate enum type to int, so it can be compared...
040180078 - Ensure that destructor'\''s needed by lldb make it into binary in non-inlined fashion
0fb2ed140 - Make element accessors constexpr
8b4efc8ed - check_size_for_overflow: use numeric limits instead of c99 macro

PiperOrigin-RevId: 698131745
Change-Id: Ia044ff8444a6d2266afdd7ad05556afd3be2366d
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index f7f0b23..f40b2f4 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -738,6 +738,7 @@
 }
 
 // Specialization for filling the destination with a constant value.
+#if !EIGEN_COMP_MSVC
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template <typename DstXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(
@@ -748,6 +749,7 @@
   std::fill_n(dst.data(), dst.size(), src.functor()());
 }
 #endif
+#endif
 
 template <typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 5b566cd..ad11220 100644
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -89,7 +89,7 @@
     static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE, EIGENTYPE> &func) { \
       resize_if_allowed(dst, src, func);                                                                   \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                  \
-      if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == LinearTraversal) {                     \
+      if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == (int)LinearTraversal) {                \
         VMLOP(dst.size(), (const VMLTYPE *)src.nestedExpression().data(),                                  \
               (VMLTYPE *)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                     \
       } else {                                                                                             \
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 879b0db..156ca2b 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -124,8 +124,7 @@
   // noncopyable:
   // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
   // and make complex evaluator much larger than then should do.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator_base() = default;
 
  private:
   EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
@@ -143,7 +142,7 @@
 template <typename Scalar, int OuterStride>
 class plainobjectbase_evaluator_data {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
       : data(ptr) {
 #ifndef EIGEN_INTERNAL_DEBUGGING
     EIGEN_UNUSED_VARIABLE(outerStride);
@@ -157,9 +156,9 @@
 template <typename Scalar>
 class plainobjectbase_evaluator_data<Scalar, Dynamic> {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
       : data(ptr), m_outerStride(outerStride) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return m_outerStride; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const { return m_outerStride; }
   const Scalar* data;
 
  protected:
@@ -189,32 +188,34 @@
                                                      : RowsAtCompileTime
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() : m_d(0, OuterStrideAtCompileTime) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const PlainObjectType& m)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const PlainObjectType& m)
       : m_d(m.data(), IsVectorAtCompileTime ? 0 : m.outerStride()) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
     if (IsRowMajor)
       return m_d.data[row * m_d.outerStride() + col];
     else
       return m_d.data[row + col * m_d.outerStride()];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
     if (IsRowMajor)
       return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
     else
       return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return const_cast<Scalar*>(m_d.data)[index]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
+    return const_cast<Scalar*>(m_d.data)[index];
+  }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
@@ -251,9 +252,10 @@
     : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+      : evaluator<PlainObjectBase<XprType>>(m) {}
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -261,9 +263,10 @@
     : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+      : evaluator<PlainObjectBase<XprType>>(m) {}
 };
 
 // -------------------- Transpose --------------------
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 30e0aa3..97f9b50 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -298,7 +298,7 @@
    *
    * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
     eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return internal::evaluator<Derived>(derived()).coeffRef(row, col);
   }
@@ -312,7 +312,7 @@
    * \sa operator[](Index)
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index row, Index col) {
     eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return coeffRef(row, col);
   }
@@ -332,7 +332,7 @@
    * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
     EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
                         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
     eigen_internal_assert(index >= 0 && index < size());
@@ -346,7 +346,7 @@
    * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator[](Index index) {
     EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                         THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
     eigen_assert(index >= 0 && index < size());
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index a2d6ee2..6d16700 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -46,9 +46,9 @@
   typedef typename internal::traits<Derived>::StorageKind StorageKind;
 
   /** \returns a reference to the derived object */
-  EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast<Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr Derived& derived() { return *static_cast<Derived*>(this); }
   /** \returns a const reference to the derived object */
-  EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
   EIGEN_DEVICE_FUNC inline Derived& const_cast_derived() const {
     return *static_cast<Derived*>(const_cast<EigenBase*>(this));
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 1980e92..7e4f054 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -1934,6 +1934,22 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-1024, 1024]
+  const Packet4d min_exponent = pset1<Packet4d>(-1023.0);
+  const Packet4d max_exponent = pset1<Packet4d>(1024.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent));
+  const Packet4i bias = pset1<Packet4i>(1023);
+
+  // 2^e
+  Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3);
+  const Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  return pmul(a, c);  // a * 2^e
+}
+
+template <>
 EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
   return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
 }
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 78d17d5..5d869e4 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -155,6 +155,7 @@
     HasExp = 1,
     HasATan = 1,
     HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
     HasATanh = 1,
     HasCmp = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index da26cd4..49220ca 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -3183,6 +3183,7 @@
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
     HasATanh = 1,
     HasATan = 0,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 4e441b4..e21d3ef 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -274,22 +274,20 @@
 //
 // Assumes IEEE floating point format
 template <typename Packet>
-struct pldexp_fast_impl {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<PacketI>::type ScalarI;
   static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
                        ExponentBits = TotalBits - MantissaBits - 1;
 
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) {
-    const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
-    const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
-    // restrict biased exponent between 0 and 255 for float.
-    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
-    // return a * (2^e)
-    return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
-  }
-};
+  const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
+  const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
+  // restrict biased exponent between 0 and 255 for float.
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
+  // return a * (2^e)
+  return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
+}
 
 // Natural or base 2 logarithm.
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
@@ -514,6 +512,7 @@
   const Packet cst_half = pset1<Packet>(0.5f);
   const Packet cst_exp_hi = pset1<Packet>(88.723f);
   const Packet cst_exp_lo = pset1<Packet>(-104.f);
+  const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
 
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
   const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
@@ -549,7 +548,12 @@
   y = pmadd(r2, y, p_low);
 
   // Return 2^m * exp(r).
-  // TODO: replace pldexp with faster implementation since y in [-1, 1).
+  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
+  if (!predux_any(fast_pldexp_unsafe)) {
+    // For |x| <= 87, we know the result is not zero or inf, and we can safely use
+    // the fast version of pldexp.
+    return pmax(pldexp_fast(y, m), _x);
+  }
   return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
 }
 
@@ -562,8 +566,8 @@
   const Packet cst_half = pset1<Packet>(0.5);
 
   const Packet cst_exp_hi = pset1<Packet>(709.784);
-  const Packet cst_exp_lo = pset1<Packet>(-709.784);
-
+  const Packet cst_exp_lo = pset1<Packet>(-745.519);
+  const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
   const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
   const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
@@ -616,7 +620,12 @@
 
   // Construct the result 2^n * exp(g) = e * x. The max is used to catch
   // non-finite values in the input.
-  // TODO: replace pldexp with faster implementation since x in [-1, 1).
+  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
+  if (!predux_any(fast_pldexp_unsafe)) {
+    // For |x| <= 708, we know the result is not zero or inf, and we can safely use
+    // the fast version of pldexp.
+    return pmax(pldexp_fast(x, fx), _x);
+  }
   return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
 }
 
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 3b362f4..ac0e2cf 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -42,6 +42,18 @@
 template <typename Packet>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent);
 
+// Explicitly multiplies
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent);
+
 /** \internal \returns log(x) for single precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 2f401fd..3f2d9d5 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -5141,7 +5141,7 @@
     HasSqrt = 1,
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf = 0,
+    HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH
   };
 };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index c749763..f294009 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -1766,7 +1766,6 @@
 
 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
 // supported by SSE, and has more range than is needed for exponents.
-// TODO(rmlarsen): Remove this specialization once Packet2l has support or casting.
 template <>
 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   // Clamp exponent to [-2099, 2099]
@@ -1787,6 +1786,24 @@
   return out;
 }
 
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-1023, 1024]
+  const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
+  const Packet2d max_exponent = pset1<Packet2d>(1024.0);
+  const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
+
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+
+  // Compute 2^e multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52));  // 2^e
+  return pmul(a, c);
+}
+
 // with AVX, the default implementations based on pload1 are faster
 #ifndef __AVX__
 template <>
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index c53bb90..a478b80 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -131,8 +131,7 @@
 struct functor_traits<linspaced_op<Scalar> > {
   enum {
     Cost = 1,
-    PacketAccess =
-        (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
+    PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
     /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/  // <- vectorization for integer is
                                                                                 // currently disabled
     IsRepeatable = true
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index defd3c2..03542e3 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -1292,7 +1292,7 @@
     p = pmadd(r2, p, p_low);
 
     // 4. Undo subtractive range reduction exp(m*ln(2) + r) = 2^m * exp(r).
-    Packet e = pldexp_fast_impl<Packet>::run(p, m);
+    Packet e = pldexp_fast(p, m);
 
     // 5. Undo multiplicative range reduction by using exp(r) = exp(r/2)^2.
     e = pmul(e, e);
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 2acdd9d..a278c91 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -391,7 +391,7 @@
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) {
-  constexpr std::size_t max_elements = PTRDIFF_MAX / sizeof(T);
+  constexpr std::size_t max_elements = (std::numeric_limits<std::ptrdiff_t>::max)() / sizeof(T);
   if (size > max_elements) throw_std_bad_alloc();
 }
 
diff --git a/test/constexpr.cpp b/test/constexpr.cpp
index 9fdf447..34c728f 100644
--- a/test/constexpr.cpp
+++ b/test/constexpr.cpp
@@ -16,38 +16,48 @@
   // until after the constructor returns:
   // error: member ‘Eigen::internal::plain_array<int, 9, 0, 0>::array’ must be initialized by mem-initializer in
   // ‘constexpr’ constructor
-#if EIGEN_COMP_CXXVER >= 20
+#if __cpp_constexpr >= 201907L
   constexpr Matrix3i mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   VERIFY_IS_EQUAL(mat.size(), 9);
-  VERIFY_IS_EQUAL(mat(0, 0), 1);
+  static_assert(mat(0, 0) == 1);
+  static_assert(mat(0) == 1);
   static_assert(mat.coeff(0, 1) == 2);
   constexpr Array33i arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  VERIFY_IS_EQUAL(arr(0, 0), 1);
+  static_assert(arr(0, 0) == 1);
+  static_assert(arr(0) == 1);
   VERIFY_IS_EQUAL(arr.size(), 9);
   static_assert(arr.coeff(0, 1) == 2);
+  constexpr RowVector3i vec{{1, 2, 3}};
+  static_assert(vec(0, 0) == 1);
+  static_assert(vec[0] == 1);
+  VERIFY_IS_EQUAL(vec.size(), 3);
+  static_assert(vec.coeff(0, 1) == 2);
+
   // Also check dynamic size arrays/matrices with fixed-size storage (currently
   // only works if all elements are initialized, since otherwise the compiler
   // complains about uninitialized trailing elements.
   constexpr Matrix<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   VERIFY_IS_EQUAL(dyn_mat.size(), 9);
-  VERIFY_IS_EQUAL(dyn_mat(0, 0), 1);
+  static_assert(dyn_mat(0, 0) == 1);
   static_assert(dyn_mat.coeff(0, 1) == 2);
   constexpr Array<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  VERIFY_IS_EQUAL(dyn_arr(0, 0), 1);
+  static_assert(dyn_arr(0, 0) == 1);
+  static_assert(dyn_arr(0) == 1);
   VERIFY_IS_EQUAL(dyn_arr.size(), 9);
   static_assert(dyn_arr.coeff(0, 1) == 2);
-#endif  // EIGEN_COMP_CXXVER >= 20
+#endif  // __cpp_constexpr >= 201907L
 }
 
 // Check that we can use the std::initializer_list constructor for constexpr variables.
-#if EIGEN_COMP_CXXVER >= 20
+#if __cpp_constexpr >= 201907L
 // EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT() will fail constexpr evaluation unless
 // we have std::is_constant_evaluated().
 constexpr Matrix<int, 2, 2> global_mat({{1, 2}, {3, 4}});
 
 EIGEN_DECLARE_TEST(constexpr_global) {
   VERIFY_IS_EQUAL(global_mat.size(), 4);
-  VERIFY_IS_EQUAL(global_mat(0, 0), 1);
+  static_assert(global_mat(0, 0) == 1);
+  static_assert(global_mat(0) == 1);
   static_assert(global_mat.coeff(0, 0) == 1);
 }
-#endif  // EIGEN_COMP_CXXVER >= 20
+#endif  // __cpp_constexpr >= 201907L
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
index 0f6ab64..d394a94 100644
--- a/test/dense_storage.cpp
+++ b/test/dense_storage.cpp
@@ -38,6 +38,13 @@
 // all fixed-size, fixed-dimension plain object types are trivially move constructible
 static_assert(std::is_trivially_move_constructible<Matrix4f>::value, "Matrix4f not trivially_move_constructible");
 static_assert(std::is_trivially_move_constructible<Array4f>::value, "Array4f not trivially_move_constructible");
+// all statically-allocated plain object types are trivially destructible
+static_assert(std::is_trivially_destructible<Matrix4f>::value, "Matrix4f not trivially_destructible");
+static_assert(std::is_trivially_destructible<Array4f>::value, "Array4f not trivially_destructible");
+static_assert(std::is_trivially_destructible<Matrix<float, 4, Dynamic, 0, 4, 4>>::value,
+              "Matrix4X44 not trivially_destructible");
+static_assert(std::is_trivially_destructible<Array<float, 4, Dynamic, 0, 4, 4>>::value,
+              "Array4X44 not trivially_destructible");
 #if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
 // all fixed-size, fixed-dimension plain object types are trivially copy constructible
 static_assert(std::is_trivially_copy_constructible<Matrix4f>::value, "Matrix4f not trivially_copy_constructible");
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 0b266f9..5f95fd0 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -269,81 +269,8 @@
   }
 };
 
-/****************************************************************************
- * Implementation of erf, requires C++11/C99                                *
- ****************************************************************************/
-
-/** \internal \returns the error function of \a a (coeff-wise)
-    This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for
-    normalized floats.
-
-    This implementation works on both scalars and SIMD "packets".
-*/
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
-  // The monomial coefficients of the numerator polynomial (odd).
-  constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
-                             3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
-                             1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
-
-  // The monomial coefficients of the denominator polynomial (even).
-  constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
-                            1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
-                            4.99425798654556274414062500000e-01f, 1.0f};
-
-  // Since the polynomials are odd/even, we need x^2.
-  // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
-  // computing Inf/Inf below.
-  const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
-
-  // Evaluate the numerator polynomial p.
-  T p = ppolevl<T, 5>::run(x2, alpha);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial p.
-  T q = ppolevl<T, 5>::run(x2, beta);
-  const T r = pdiv(p, q);
-
-  // Clamp to [-1:1].
-  return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f));
-}
-
-template <typename T>
-struct erf_impl {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf_float(x); }
-};
-
-template <typename Scalar>
-struct erf_retval {
-  typedef Scalar type;
-};
-
-#if EIGEN_HAS_C99_MATH
-template <>
-struct erf_impl<float> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) {
-#if defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::erf(x);
-#else
-    return generic_fast_erf_float(x);
-#endif
-  }
-};
-
-template <>
-struct erf_impl<double> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) {
-#if defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::erf(x);
-#else
-    return ::erf(x);
-#endif
-  }
-};
-#endif  // EIGEN_HAS_C99_MATH
-
 /***************************************************************************
- * Implementation of erfc, requires C++11/C99                               *
+ * Implementation of erfc.
  ****************************************************************************/
 template <typename Scalar>
 struct generic_fast_erfc {
@@ -366,7 +293,7 @@
                              2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
                              3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
   const T x2 = pmul(x, x);
-  const T one = pset1<T>(1.0);
+  const T one = pset1<T>(1.0f);
   const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);
 
   // Return early if we don't need the more expensive approximation for any
@@ -401,46 +328,42 @@
   return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
 }
 
-template <>
+// Computes erf(x)/x for |x| <= 1. Used by both erf and erfc implementations.
+// Takes x2 = x^2 as input.
+//
+// PRECONDITION: x2 <= 1.
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
-  // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold).
-  // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
-  constexpr double kClamp = 28.0;
-  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
-
-  // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erf_over_x_double_small(const T& x2) {
+  // erf(x)/x =  S(x^2) / T(x^2), x^2 <= 1.
   //
   // Coefficients for S and T generated with Rminimax command:
-  //  ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10]
+  //  ./ratapprox --function="erf(x)" --dom='[-1,1]' --type=[9,10]
   //  --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
-  constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04,
-                              -1.8272566210022942682217328425053892715368419885635375976562500e-03,
-                              -4.5303363351690106863856044583371840417385101318359375000000000e-02,
-                              -1.4215015503619179981775744181504705920815467834472656250000000e-01,
-                              -1.1283791670955125585606992899556644260883331298828125000000000e+00};
+  constexpr double alpha[] = {1.9493725660006057018823477644531294572516344487667083740234375e-04,
+                              1.8272566210022942682217328425053892715368419885635375976562500e-03,
+                              4.5303363351690106863856044583371840417385101318359375000000000e-02,
+                              1.4215015503619179981775744181504705920815467834472656250000000e-01,
+                              1.1283791670955125585606992899556644260883331298828125000000000e+00};
   constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
                              6.8117805899186819641732970609382391558028757572174072265625000e-04,
                              1.0582026056098614921752165685120417037978768348693847656250000e-02,
                              9.3252603143757495374188692949246615171432495117187500000000000e-02,
                              4.5931062818368939559832142549566924571990966796875000000000000e-01,
                              1.0};
-  const T x2 = pmul(x, x);
   const T num_small = ppolevl<T, 4>::run(x2, alpha);
   const T denom_small = ppolevl<T, 5>::run(x2, beta);
-  const T one = pset1<T>(1.0);
-  const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one);
+  return pdiv(num_small, denom_small);
+}
 
-  // Return early if we don't need the more expensive approximation for any
-  // entry in a.
-  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
-  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
-
-  // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27.
-  //
-  // Coefficients for P and Q generated with Rminimax command:
-  //  ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)"  --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
-  //  --denF="[D]" --log --dispCoeff="dec"
+// erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 28.
+//
+// Coefficients for P and Q generated with Rminimax command:
+//  ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)"  --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
+//  --denF="[D]" --log --dispCoeff="dec"
+//
+// PRECONDITION: 1 < x < 28.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erfc_double_large(const T& x, const T& x2) {
   constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
                               1.0909912393738931124520519233556115068495273590087890625000000e-02,
                               1.0628604636755033252537572252549580298364162445068359375000000e-01,
@@ -461,7 +384,7 @@
                               3.152505418656005586885981983868987299501895904541015625000000e-02,
                               2.565085751861882583380047861965067568235099315643310546875000e-03,
                               7.899362131678837697403017248376499992446042597293853759765625e-05};
-
+  // Compute exp(-x^2).
   const T x2_lo = twoprod_low(x, x, x2);
   // Here we use that
   //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
@@ -469,12 +392,34 @@
   // from 258 ulps to below 7 ulps.
   const T exp2_hi = pexp(pnegate(x2));
   const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+  // Compute r = P / Q.
   const T q2 = preciprocal(x2);
   const T num_large = ppolevl<T, 9>::run(q2, gamma);
   const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
   const T r = pdiv(num_large, denom_large);
   const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
-  const T erfc_large = pmadd(z, r, maybe_two);
+  return pmadd(z, r, maybe_two);
+}
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
+  // Clamp x to [-28:28] beyond which erfc(x) is either two or zero (below the underflow threshold).
+  // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
+  constexpr double kClamp = 28.0;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
+
+  // For |x| < 1, we use erfc(x) = 1 - erf(x).
+  const T x2 = pmul(x, x);
+  const T one = pset1<T>(1.0);
+  const T erfc_small = pnmadd(x, erf_over_x_double_small(x2), one);
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+  const T erfc_large = erfc_double_large(x, x2);
   return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
 }
 
@@ -513,6 +458,104 @@
 };
 #endif  // EIGEN_HAS_C99_MATH
 
+/****************************************************************************
+ * Implementation of erf.
+ ****************************************************************************/
+
+template <typename Scalar>
+struct generic_fast_erf {
+  template <typename T>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+/** \internal \returns the error function of \a a (coeff-wise)
+    This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for
+    normalized floats.
+
+    This implementation works on both scalars and SIMD "packets".
+*/
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<float>::run(const T& x) {
+  // The monomial coefficients of the numerator polynomial (odd).
+  constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
+                             3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
+                             1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
+
+  // The monomial coefficients of the denominator polynomial (even).
+  constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
+                            1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
+                            4.99425798654556274414062500000e-01f, 1.0f};
+
+  // Since the polynomials are odd/even, we need x^2.
+  // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
+  // computing Inf/Inf below.
+  const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
+
+  // Evaluate the numerator polynomial p.
+  T p = ppolevl<T, 5>::run(x2, alpha);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial p.
+  T q = ppolevl<T, 5>::run(x2, beta);
+  const T r = pdiv(p, q);
+
+  // Clamp to [-1:1].
+  return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f));
+}
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<double>::run(const T& x) {
+  T x2 = pmul(x, x);
+  T erf_small = pmul(x, erf_over_x_double_small(x2));
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T one = pset1<T>(1.0);
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erf_small;
+
+  // For |x| > 1, use erf(x) = 1 - erfc(x).
+  const T erf_large = psub(one, erfc_double_large(x, x2));
+  return pselect(x_abs_gt_one_mask, erf_large, erf_small);
+}
+
+template <typename T>
+struct erf_impl {
+  typedef typename unpacket_traits<T>::type Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf<Scalar>::run(x); }
+};
+
+template <typename Scalar>
+struct erf_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf<float>::run(x);
+#endif
+  }
+};
+
+template <>
+struct erf_impl<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf<double>::run(x);
+#endif
+  }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
 /***************************************************************************
  * Implementation of ndtri.                                                 *
  ****************************************************************************/