diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index 1332b54..a318ceb 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -43,4 +43,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CHOLESKY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/Core b/Eigen/Core
index ef77851..1a60dcb 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -208,6 +208,10 @@
   #include "src/Core/arch/NEON/TypeCasting.h"
   #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_SVE
+  #include "src/Core/arch/SVE/PacketMath.h"
+  #include "src/Core/arch/SVE/TypeCasting.h"
+  #include "src/Core/arch/SVE/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_ZVECTOR
   #include "src/Core/arch/ZVector/PacketMath.h"
   #include "src/Core/arch/ZVector/MathFunctions.h"
@@ -336,7 +340,9 @@
 #include "src/Core/ConditionEstimator.h"
 
 #if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
-#include "src/Core/arch/AltiVec/MatrixProduct.h"
+  #include "src/Core/arch/AltiVec/MatrixProduct.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
 #endif
 
 #include "src/Core/BooleanRedux.h"
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 7d6ac78..5467a2e 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -58,4 +58,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_EIGENVALUES_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/Geometry b/Eigen/Geometry
index 16b4bd6..bc78110 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -50,11 +50,10 @@
 #include "src/Geometry/Umeyama.h"
 
 // Use the SSE optimized version whenever possible.
-#if defined EIGEN_VECTORIZE_SSE
-#include "src/Geometry/arch/Geometry_SSE.h"
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
+#include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_GEOMETRY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/Householder b/Eigen/Householder
index 89cd81b..f2fa799 100644
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -27,4 +27,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_HOUSEHOLDER_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/Jacobi b/Eigen/Jacobi
index 17c1d78..43edc7a 100644
--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi
@@ -29,5 +29,4 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_JACOBI_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
 
diff --git a/Eigen/LU b/Eigen/LU
index 2a6b771..0fb184b 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -47,4 +47,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LU_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/QR b/Eigen/QR
index 1be1863..8465b62 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -48,4 +48,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_QR_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index 4f07df0..6fe8237 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -37,4 +37,3 @@
 #endif
 
 #endif // EIGEN_QTMALLOC_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/SVD b/Eigen/SVD
index 5d0e75f..3451794 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -48,4 +48,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SVD_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 492cd5a..30f1f52 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -45,7 +45,7 @@
   * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
   * is lower triangular with a unit diagonal and D is a diagonal matrix.
   *
-  * The decomposition uses pivoting to ensure stability, so that L will have
+  * The decomposition uses pivoting to ensure stability, so that D will have
   * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
   * on D also stabilizes the computation.
   *
@@ -200,7 +200,7 @@
       * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
       * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
       * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
-      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
+      * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular.
       *
       * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
       */
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index ddac9df..8f3496f 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -383,6 +383,33 @@
   return setConstant(val);
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(NoChange_t, Index cols, const Scalar& val)
+{
+  return setConstant(rows(), cols, val);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index rows, NoChange_t, const Scalar& val)
+{
+  return setConstant(rows, cols(), val);
+}
+
+
 /**
   * \brief Sets a linearly spaced vector.
   *
@@ -556,6 +583,32 @@
   return setConstant(Scalar(0));
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(NoChange_t, Index cols)
+{
+  return setZero(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index rows, NoChange_t)
+{
+  return setZero(rows, cols());
+}
+
 // ones:
 
 /** \returns an expression of a matrix where all coefficients equal one.
@@ -682,6 +735,32 @@
   return setConstant(Scalar(1));
 }
 
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index rows, NoChange_t)
+{
+  return setOnes(rows, cols());
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(NoChange_t, Index cols)
+{
+  return setOnes(rows(), cols);
+}
+
 // Identity:
 
 /** \returns an expression of the identity matrix (not necessarily square).
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 671ed3c..f8c7826 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -250,8 +250,7 @@
 
 template <typename RealScalar>
 EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
-  RealScalar b;
-  b = ptrue(b);
+  RealScalar b = ptrue(RealScalar(0));
   return std::complex<RealScalar>(b, b);
 }
 
@@ -267,37 +266,43 @@
   return c;
 }
 
+template<typename T>
+struct bit_and {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a & b;
+  }
+};
+
+template<typename T>
+struct bit_or {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a | b;
+  }
+};
+
+template<typename T>
+struct bit_xor {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a ^ b;
+  }
+};
+
 /** \internal \returns the bitwise and of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pand(const Packet& a, const Packet& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-  return bitwise_helper(a ,b, std::bit_and<unsigned char>());
-#else
-  EIGEN_USING_STD(bit_and);
-  return bitwise_helper(a ,b, bit_and<unsigned char>());
-#endif
+  return bitwise_helper(a, b, bit_and<unsigned char>());
 }
 
 /** \internal \returns the bitwise or of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 por(const Packet& a, const Packet& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-  return bitwise_helper(a ,b, std::bit_or<unsigned char>());
-#else
-  EIGEN_USING_STD(bit_or);
   return bitwise_helper(a ,b, bit_or<unsigned char>());
-#endif
 }
 
 /** \internal \returns the bitwise xor of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pxor(const Packet& a, const Packet& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-  return bitwise_helper(a ,b, std::bit_xor<unsigned char>());
-#else
-  EIGEN_USING_STD(bit_xor);
   return bitwise_helper(a ,b, bit_xor<unsigned char>());
-#endif
 }
 
 /** \internal \returns the bitwise and of \a a and not \a b */
@@ -407,6 +412,12 @@
 template<> EIGEN_DEVICE_FUNC inline unsigned long long
 pabs(const unsigned long long& a) { return a; }
 
+/** \internal \returns the addsub value of \a a,b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+paddsub(const Packet& a, const Packet& b) {
+  return pselect(peven_mask(a), padd(a, b), psub(a, b));
+ }
+
 /** \internal \returns the phase angle of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 parg(const Packet& a) { using numext::arg; return arg(a); }
@@ -437,18 +448,18 @@
 EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
   int exp;
   EIGEN_USING_STD(frexp);
-  Packet result = frexp(a, &exp);
+  Packet result = static_cast<Packet>(frexp(a, &exp));
   exponent = static_cast<Packet>(exp);
   return result;
 }
 
-/** \internal \returns a * 2^exponent
+/** \internal \returns a * 2^((int)exponent)
   * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
   */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pldexp(const Packet &a, const Packet &exponent) {
   EIGEN_USING_STD(ldexp)
-  return ldexp(a, static_cast<int>(exponent));
+  return static_cast<Packet>(ldexp(a, static_cast<int>(exponent)));
 }
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
@@ -894,28 +905,6 @@
   return ifPacket.select[0] ? thenPacket : elsePacket;
 }
 
-/***************************************************************************
- * Some generic implementations to be used by implementors
-***************************************************************************/
-
-/** Default implementation of pfrexp for float.
-  * It is expected to be called by implementers of template<> pfrexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_float(const Packet& a, Packet& exponent);
-
-/** Default implementation of pldexp for float.
-  * It is expected to be called by implementers of template<> pldexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(Packet a, Packet exponent);
-
-/** Default implementation of pldexp for double.
-  * It is expected to be called by implementers of template<> pldexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_double(Packet a, Packet exponent);
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index f64116a..e29733c 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -324,7 +324,7 @@
 };
 
 /****************************************************************************
-* Implementation of sqrt                                                 *
+* Implementation of sqrt/rsqrt                                             *
 ****************************************************************************/
 
 template<typename Scalar>
@@ -341,8 +341,8 @@
 // Complex sqrt defined in MathFunctionsImpl.h.
 template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
 
-// MSVC incorrectly handles inf cases.
-#if EIGEN_COMP_MSVC > 0
+// Custom implementation is faster than `std::sqrt`, works on
+// GPU, and correctly handles special cases (unlike MSVC).
 template<typename T>
 struct sqrt_impl<std::complex<T> >
 {
@@ -352,7 +352,6 @@
     return complex_sqrt<T>(x);
   }
 };
-#endif
 
 template<typename Scalar>
 struct sqrt_retval
@@ -360,6 +359,29 @@
   typedef Scalar type;
 };
 
+// Default implementation relies on numext::sqrt, at bottom of file.
+template<typename T>
+struct rsqrt_impl;
+
+// Complex rsqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& a_x);
+
+template<typename T>
+struct rsqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_rsqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct rsqrt_retval
+{
+  typedef Scalar type;
+};
+
 /****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
@@ -533,45 +555,63 @@
 ****************************************************************************/
 
 #if EIGEN_HAS_CXX11_MATH
-  template<typename Scalar>
-  struct arg_impl {
-    EIGEN_DEVICE_FUNC
-    static inline Scalar run(const Scalar& x)
-    {
-      #if defined(EIGEN_HIP_DEVICE_COMPILE)
-      // HIP does not seem to have a native device side implementation for the math routine "arg"
-      using std::arg;
-      #else
-      EIGEN_USING_STD(arg);
-      #endif
-      return arg(x);
-    }
-  };
+// std::arg is only defined for types of std::complex, or integer types or float/double/long double
+template<typename Scalar,
+          bool HasStdImpl = NumTraits<Scalar>::IsComplex || is_integral<Scalar>::value
+                            || is_same<Scalar, float>::value || is_same<Scalar, double>::value
+                            || is_same<Scalar, long double>::value >
+struct arg_default_impl;
+
+template<typename Scalar>
+struct arg_default_impl<Scalar, true> {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    // HIP does not seem to have a native device side implementation for the math routine "arg"
+    using std::arg;
+    #else
+    EIGEN_USING_STD(arg);
+    #endif
+    return static_cast<Scalar>(arg(x));
+  }
+};
+
+// Must be non-complex floating-point type (e.g. half/bfloat16).
+template<typename Scalar>
+struct arg_default_impl<Scalar, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
+  }
+};
 #else
-  template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-  struct arg_default_impl
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct arg_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_DEVICE_FUNC
-    static inline RealScalar run(const Scalar& x)
-    {
-      return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); }
-  };
+    return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
+  }
+};
 
-  template<typename Scalar>
-  struct arg_default_impl<Scalar,true>
+template<typename Scalar>
+struct arg_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_DEVICE_FUNC
-    static inline RealScalar run(const Scalar& x)
-    {
-      EIGEN_USING_STD(arg);
-      return arg(x);
-    }
-  };
-
-  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+    EIGEN_USING_STD(arg);
+    return arg(x);
+  }
+};
 #endif
+template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
 
 template<typename Scalar>
 struct arg_retval
@@ -623,36 +663,6 @@
   }
 };
 
-// Specialization for complex types that are not supported by std::expm1.
-template <typename RealScalar>
-struct expm1_impl<std::complex<RealScalar> > {
-  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
-      const std::complex<RealScalar>& x) {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
-    RealScalar xr = x.real();
-    RealScalar xi = x.imag();
-    // expm1(z) = exp(z) - 1
-    //          = exp(x +  i * y) - 1
-    //          = exp(x) * (cos(y) + i * sin(y)) - 1
-    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
-    // Imag(expm1(z)) = exp(x) * sin(y)
-    // Real(expm1(z)) = exp(x) * cos(y) - 1
-    //          = exp(x) * cos(y) - 1.
-    //          = expm1(x) + exp(x) * (cos(y) - 1)
-    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
-
-    // TODO better use numext::expm1 and numext::sin (but that would require forward declarations or moving this specialization down).
-    RealScalar erm1 = expm1_impl<RealScalar>::run(xr);
-    RealScalar er = erm1 + RealScalar(1.);
-    EIGEN_USING_STD(sin);
-    RealScalar sin2 = sin(xi / RealScalar(2.));
-    sin2 = sin2 * sin2;
-    RealScalar s = sin(xi);
-    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
-    return std::complex<RealScalar>(real_part, er * s);
-  }
-};
-
 template<typename Scalar>
 struct expm1_retval
 {
@@ -1421,11 +1431,19 @@
 SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
 #endif
 
+/** \returns the reciprocal square root of \a x. **/
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T rsqrt(const T& x)
+{
+  return internal::rsqrt_impl<T>::run(x);
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T log(const T &x) {
   EIGEN_USING_STD(log);
-  return log(x);
+  return static_cast<T>(log(x));
 }
 
 #if defined(SYCL_DEVICE_ONLY)
@@ -1602,7 +1620,7 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T acosh(const T &x) {
   EIGEN_USING_STD(acosh);
-  return acosh(x);
+  return static_cast<T>(acosh(x));
 }
 #endif
 
@@ -1631,7 +1649,7 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T asinh(const T &x) {
   EIGEN_USING_STD(asinh);
-  return asinh(x);
+  return static_cast<T>(asinh(x));
 }
 #endif
 
@@ -1652,7 +1670,7 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atan(const T &x) {
   EIGEN_USING_STD(atan);
-  return atan(x);
+  return static_cast<T>(atan(x));
 }
 
 #if EIGEN_HAS_CXX11_MATH
@@ -1660,7 +1678,7 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atanh(const T &x) {
   EIGEN_USING_STD(atanh);
-  return atanh(x);
+  return static_cast<T>(atanh(x));
 }
 #endif
 
@@ -1682,7 +1700,7 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cosh(const T &x) {
   EIGEN_USING_STD(cosh);
-  return cosh(x);
+  return static_cast<T>(cosh(x));
 }
 
 #if defined(SYCL_DEVICE_ONLY)
@@ -1701,7 +1719,7 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sinh(const T &x) {
   EIGEN_USING_STD(sinh);
-  return sinh(x);
+  return static_cast<T>(sinh(x));
 }
 
 #if defined(SYCL_DEVICE_ONLY)
@@ -1936,6 +1954,45 @@
 
 };
 
+} // end namespace internal
+
+// Default implementations that rely on other numext implementations
+namespace internal {
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    RealScalar xr = x.real();
+    RealScalar xi = x.imag();
+    // expm1(z) = exp(z) - 1
+    //          = exp(x +  i * y) - 1
+    //          = exp(x) * (cos(y) + i * sin(y)) - 1
+    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
+    // Imag(expm1(z)) = exp(x) * sin(y)
+    // Real(expm1(z)) = exp(x) * cos(y) - 1
+    //          = exp(x) * cos(y) - 1.
+    //          = expm1(x) + exp(x) * (cos(y) - 1)
+    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
+    RealScalar erm1 = numext::expm1<RealScalar>(xr);
+    RealScalar er = erm1 + RealScalar(1.);
+    RealScalar sin2 = numext::sin(xi / RealScalar(2.));
+    sin2 = sin2 * sin2;
+    RealScalar s = numext::sin(xi);
+    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
+    return std::complex<RealScalar>(real_part, er * s);
+  }
+};
+
+template<typename T>
+struct rsqrt_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE T run(const T& x) {
+    return T(1)/numext::sqrt(x);
+  }
+};
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 9222285..0d3f317 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -79,6 +79,12 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
+  // IEEE IEC 6059 special cases.
+  if ((numext::isinf)(x) || (numext::isinf)(y))
+    return NumTraits<RealScalar>::infinity();
+  if ((numext::isnan)(x) || (numext::isnan)(y))
+    return NumTraits<RealScalar>::quiet_NaN();
+    
   EIGEN_USING_STD(sqrt);
   RealScalar p, qp;
   p = numext::maxi(x,y);
@@ -128,20 +134,56 @@
   const T x = numext::real(z);
   const T y = numext::imag(z);
   const T zero = T(0);
-  const T cst_half = T(0.5);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
 
-  // Special case of isinf(y)
-  if ((numext::isinf)(y)) {
-    return std::complex<T>(std::numeric_limits<T>::infinity(), y);
-  }
-
-  T w = numext::sqrt(cst_half * (numext::abs(x) + numext::abs(z)));
   return
-    x == zero ? std::complex<T>(w, y < zero ? -w : w)
-    : x > zero ? std::complex<T>(w, y / (2 * w))
+    (numext::isinf)(y) ? std::complex<T>(NumTraits<T>::infinity(), y)
+      : x == zero ? std::complex<T>(w, y < zero ? -w : w)
+      : x > zero ? std::complex<T>(w, y / (2 * w))
       : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
 }
 
+// Generic complex rsqrt implementation.
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
+  // Computes the principal reciprocal sqrt of the input.
+  //
+  // For a complex reciprocal square root of the number z = x + i*y. We want to
+  // find real numbers u and v such that
+  //    (u + i*v)^2 = 1 / (x + i*y)  <=>
+  //    u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x/|z|^2
+  //    2*u*v = y/|z|^2.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + |z|)) / |z|
+  //    v = -y / (2 * u * |z|)
+  // and for x < 0,
+  //    v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z|
+  //    u = -y / (2 * v * |z|)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
+  //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
+  //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+
+  const T abs_z = numext::hypot(x, y);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
+  const T woz = w / abs_z;
+  // Corner cases consistent with 1/sqrt(z) on gcc/clang.
+  return
+    abs_z == zero ? std::complex<T>(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
+      : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex<T>(zero, zero)
+      : x == zero ? std::complex<T>(woz, y < zero ? woz : -woz)
+      : x > zero ? std::complex<T>(woz, -y / (2 * w * abs_z))
+      : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index f6497e9..f2c7b8c 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -717,18 +717,26 @@
     using Base::setConstant;
     EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
     EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val);
 
     using Base::setZero;
     EIGEN_DEVICE_FUNC Derived& setZero(Index size);
     EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t);
 
     using Base::setOnes;
     EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
     EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t);
 
     using Base::setRandom;
     Derived& setRandom(Index size);
     Derived& setRandom(Index rows, Index cols);
+    Derived& setRandom(NoChange_t, Index cols);
+    Derived& setRandom(Index rows, NoChange_t);
 
     #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
     #include EIGEN_PLAINOBJECTBASE_PLUGIN
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index 486e9ed..dab2ac8 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -177,6 +177,42 @@
   return setRandom();
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(NoChange_t, Index cols)
+{
+  return setRandom(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index rows, NoChange_t)
+{
+  return setRandom(rows, cols());
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_RANDOM_H
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 513742f..cbcb0a5 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -26,7 +26,7 @@
   *
   * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
   * between two consecutive columns of a column-major matrix.
-  *
+  * 
   * These two values can be passed either at compile-time as template parameters, or at runtime as
   * arguments to the constructor.
   *
@@ -38,6 +38,10 @@
   * \include Map_general_stride.cpp
   * Output: \verbinclude Map_general_stride.out
   *
+  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
+  * not allowed).
+  * 
   * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
   */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
@@ -55,6 +59,8 @@
     Stride()
       : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
     {
+      // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
+      // FIXME: for Eigen 4 we should also unify this API with fix<>
       eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
     }
 
@@ -63,7 +69,6 @@
     Stride(Index outerStride, Index innerStride)
       : m_outer(outerStride), m_inner(innerStride)
     {
-      eigen_assert(innerStride>=0 && outerStride>=0);
     }
 
     /** Copy constructor */
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index c0d362f..67041c8 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -184,6 +184,19 @@
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
 
+template <>
+EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
+  Packet8f fexponent;
+  const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
+  return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
+}
+
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
@@ -195,6 +208,19 @@
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
 
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
+  Packet8f fexponent;
+  const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
+  return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 2299e4d..34bc242 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -273,6 +273,15 @@
 
 template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_sub_epi32(a,b);
+#else
+  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
 {
@@ -379,6 +388,7 @@
   return _mm256_min_pd(b,a);
 #endif
 }
+
 template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
   // See pmin above
@@ -756,17 +766,29 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
-  // Build e=2^n by constructing the exponents in a 128-bit vector and
-  // shifting them to where they belong in double-precision values.
-  Packet4i cst_1023 = pset1<Packet4i>(1023);
-  __m128i emm0 = _mm256_cvtpd_epi32(exponent);
-  emm0 = _mm_add_epi32(emm0, cst_1023);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
-  __m128i lo = _mm_slli_epi64(emm0, 52);
-  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
-  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
-  e = _mm256_insertf128_si256(e, hi, 1);
-  return pmul(a,_mm256_castsi256_pd(e));
+  // Clamp exponent to [-2099, 2099]
+  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet4i bias = pset1<Packet4i>(1023);
+  Packet4i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  Packet4d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  out = pmul(out, c); // a * 2^e
+  return out;
 }
 
 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 66f3252..41929cb 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -191,6 +191,32 @@
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
+  Packet16f fexponent;
+  const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
+  return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
+  Packet16f fexponent;
+  const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
+  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index f838857..b9e9fdb 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -88,7 +88,7 @@
     HasCeil   = 1,
     HasRint   = 1,
     HasBessel = 1,
-    HasNdtri  = 1,
+    HasNdtri  = 1
   };
 };
 
@@ -917,16 +917,29 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
-  // Build e=2^n by constructing the exponents in a 256-bit vector and
-  // shifting them to where they belong in double-precision values.
-  Packet8i cst_1023 = pset1<Packet8i>(1023);
-  __m256i emm0 = _mm512_cvtpd_epi32(exponent);
-  emm0 = _mm256_add_epi32(emm0, cst_1023);
-  emm0 = _mm256_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
-  __m256i lo = _mm256_slli_epi64(emm0, 52);
-  __m256i hi = _mm256_slli_epi64(_mm256_srli_epi64(emm0, 32), 52);
-  __m512d b = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
-  return pmul(a, b);
+  // Clamp exponent to [-2099, 2099]
+  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
+  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet8i bias = pset1<Packet8i>(1023);
+  Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0));
+  Packet8i lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  Packet8d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0));
+  lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
 }
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index f9d0eb5..d21e139 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -29,7 +29,7 @@
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
   Packet4f  v;
 };
@@ -38,6 +38,7 @@
 {
   typedef Packet2cf type;
   typedef Packet2cf half;
+  typedef Packet4f as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -60,7 +61,7 @@
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -227,6 +228,11 @@
 }
 #endif
 
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
+{
+  return psqrt_complex<Packet2cf>(a);
+}
+
 //---------- double ----------
 #ifdef __VSX__
 struct Packet1cd
@@ -240,6 +246,7 @@
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
+  typedef Packet2d as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
@@ -259,7 +266,7 @@
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
 
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -390,6 +397,11 @@
   return Packet1cd(vec_and(eq, eq_swapped));
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
+{
+  return psqrt_complex<Packet1cd>(a);
+}
+
 #endif // __VSX__
 } // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index b863675..53116ad 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -72,7 +72,7 @@
 // are responsible to extract from convert between Eigen's and MatrixProduct approach.
 const static Packet4f p4f_CONJUGATE = {-1.0f, -1.0f, -1.0f, -1.0f};
 
-const static Packet2d p2d_CONJUGATE = {-1.0f, -1.0f};
+const static Packet2d p2d_CONJUGATE = {-1.0, -1.0};
 
 const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,
                                              8,  9, 10, 11,
@@ -1057,7 +1057,7 @@
     const int vectorSize = quad_traits<double>::vectorsize;
     Index ri = 0, j = 0;
     double *blockAt  = reinterpret_cast<double *>(blockA);
-    Packet conj = pset1<Packet>((double)-1.0f);
+    Packet conj = pset1<Packet>(-1.0);
 
     for(j = 0; j + vectorSize < rows; j+=vectorSize)
     {
@@ -1202,7 +1202,7 @@
   {
     const int vectorSize = quad_traits<double>::vectorsize;
     double *blockBt = reinterpret_cast<double *>(blockB);
-    Packet conj = pset1<Packet>((double)-1.0f);
+    Packet conj = pset1<Packet>(-1.0);
 
     Index ri = 0, j = 0;
     for(; j + 2*vectorSize < cols; j+=2*vectorSize)
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index c989543..f29b595 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1056,6 +1056,7 @@
   MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
   LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
+  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
 #else
   vec_xst(from, 0, to);
 #endif
@@ -1209,6 +1210,16 @@
   );
 }
 
+// Simple interleaving of bool masks, prevents true values from being
+// converted to NaNs.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  Packet4f bf_odd, bf_even;
+  bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
+  bf_even = plogical_shift_right<16>(even);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+
 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
   Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
   Packet4ui lsb = plogical_shift_right<16>(input);
@@ -1272,6 +1283,15 @@
   Packet4f op_odd = OP(a_odd, b_odd);\
   return F32ToBf16(op_even, op_odd);\
 
+#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16Bool(op_even, op_odd);\
+
 template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
 }
@@ -1301,6 +1321,28 @@
 template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
   BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
 }
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_float(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
+  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp_float, a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_float(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f e_even;
+  Packet4f e_odd;
+  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
+  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
+  e = F32ToBf16(e_even, e_odd);
+  return F32ToBf16(op_even, op_odd);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
   BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
 }
@@ -1340,13 +1382,13 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
-  BF16_TO_F32_BINARY_OP_WRAPPER(pcmp_lt<Packet4f>, a, b);
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
-  BF16_TO_F32_BINARY_OP_WRAPPER(pcmp_le<Packet4f>, a, b);
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
-  BF16_TO_F32_BINARY_OP_WRAPPER(pcmp_eq<Packet4f>, a, b);
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
 }
 
 template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
@@ -1364,14 +1406,6 @@
   return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_float(a,exponent);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_float(a,exponent);
-}
-
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f b, sum;
@@ -2418,38 +2452,144 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
-  
-  // build 2^n
-  Packet2l emm0 = ConvertToPacket2l(exponent);
 
-#ifdef __POWER8_VECTOR__ 
-  const Packet2l  p2l_1023 = { 1023, 1023 };
-  const Packet2ul p2ul_52 = { 52, 52 };
-  emm0 = vec_add(emm0, p2l_1023);
-  emm0 = vec_sl(emm0, p2ul_52);
+// Packet2l shifts.
+// For POWER8 we simply use vec_sr/l. 
+//
+// Things are more complicated for POWER7. There is actually a
+// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
+// So we need to shift by N % 32 and rearrage bytes.
+#ifdef __POWER8_VECTOR__
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sl(a, shift); 
+}
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sr(a, shift); 
+}
+
 #else
-  // Code is a bit complex for POWER7.  There is actually a
-  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
-  // So we shift (52-32) bits and do a word swap with zeros.
-  const Packet4i p4i_1023 = pset1<Packet4i>(1023);
-  const Packet4i p4i_20 = pset1<Packet4i>(20);    // 52 - 32
 
-  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
-  emm04i = vec_add(emm04i, p4i_1023);
-  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+// Shifts [A, B, C, D] to [B, 0, D, 0].
+// Used to implement left shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
   static const Packet16uc perm = {
-    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
-    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef  _BIG_ENDIAN
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
+      0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+      0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+// Shifts [A, B, C, D] to [0, A, 0, C].
+// Used to implement right shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 
+      0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_left_impl;
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_right = {m, m, m, m};
+    const Packet4i out_hi = vec_sl(ai, shift);
+    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return plogical_shift_left_impl<N>::run(a); 
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_right_impl;
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_left = {m, m, m, m};
+    const Packet4i out_lo = vec_sr(ai, shift);
+    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return plogical_shift_right_impl<N>::run(a); 
+}
 #endif
 
-#endif
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = ConvertToPacket2l(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
 
-  return pmul(a, reinterpret_cast<Packet2d>(emm0));
+  // Split 2^e into four factors and multiply:
+  const Packet2l  bias = { 1023, 1023 };
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
+  out = pmul(out, c); // a * 2^e
+  return out;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+  double exp[2] = { exponent[0], exponent[1] };
+  Packet2d ret = { pfrexp<double>(a[0], exp[0]), pfrexp<double>(a[1], exp[1]) };
+  exponent[0] = exp[0];
+  exponent[1] = exp[1];
+  return ret;
+//  This doesn't currently work (no integer_packet for Packet2d - but adding it causes other problems)
+//  return pfrexp_double(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index df5a3c2..caf3fe7 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,98 +15,231 @@
 
 #if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
 
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, GCC and older versions of clang do
+// not treat them as device functions and thus Eigen functors making use of
+// these operators fail to compile. Here, we manually specialize these
+// operators and functors for complex types when building for CUDA to enable
+// their use on-device.
+
+// Import Eigen's internal operator specializations.
+#define EIGEN_USING_STD_COMPLEX_OPERATORS           \
+  using Eigen::complex_operator_detail::operator+;  \
+  using Eigen::complex_operator_detail::operator-;  \
+  using Eigen::complex_operator_detail::operator*;  \
+  using Eigen::complex_operator_detail::operator/;  \
+  using Eigen::complex_operator_detail::operator+=; \
+  using Eigen::complex_operator_detail::operator-=; \
+  using Eigen::complex_operator_detail::operator*=; \
+  using Eigen::complex_operator_detail::operator/=; \
+  using Eigen::complex_operator_detail::operator==; \
+  using Eigen::complex_operator_detail::operator!=;
+
 namespace Eigen {
 
-namespace internal {
+// Specialized std::complex overloads.
+namespace complex_operator_detail {
 
-// Many std::complex methods such as operator+, operator-, operator* and
-// operator/ are not constexpr. Due to this, clang does not treat them as device
-// functions and thus Eigen functors making use of these operators fail to
-// compile. Here, we manually specialize these functors for complex types when
-// building for CUDA to avoid non-constexpr methods.
-
-// Sum
-template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    return std::complex<T>(numext::real(a) + numext::real(b),
-                           numext::imag(a) + numext::imag(b));
-  }
-};
-
-template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
-
-
-// Difference
-template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    return std::complex<T>(numext::real(a) - numext::real(b),
-                           numext::imag(a) - numext::imag(b));
-  }
-};
-
-template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
-
-
-// Product
-template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  enum {
-    Vectorizable = packet_traits<std::complex<T> >::HasMul
-  };
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    const T a_real = numext::real(a);
-    const T a_imag = numext::imag(a);
-    const T b_real = numext::real(b);
-    const T b_imag = numext::imag(b);
-    return std::complex<T>(a_real * b_real - a_imag * b_imag,
-                           a_real * b_imag + a_imag * b_real);
-  }
-};
-
-template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
-
-
-// Quotient
-template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  enum {
-    Vectorizable = packet_traits<std::complex<T> >::HasDiv
-  };
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    const T a_real = numext::real(a);
-    const T a_imag = numext::imag(a);
-    const T b_real = numext::real(b);
-    const T b_imag = numext::imag(b);
-    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
-    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
-                           (a_imag * b_real - a_real * b_imag) * norm);
-  }
-};
-
-template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
-
-// Complex sqrt is already specialized on Windows.
-#if EIGEN_COMP_MSVC == 0
 template<typename T>
-struct sqrt_impl<std::complex<T> >
-{
-  EIGEN_DEVICE_FUNC
-  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
-  {
-    return complex_sqrt<T>(x);
-  }
-};
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  return std::complex<T>(
+      a_real * b_real - a_imag * b_imag,
+      a_imag * b_real + a_real * b_imag);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                          (a_imag * b_real - a_real * b_imag) * norm);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  // Guard against over/under-flow.
+  const T scale = T(1) / (numext::abs(b_real) + numext::abs(b_imag));
+  const T a_real_scaled = numext::real(a) * scale;
+  const T a_imag_scaled = numext::imag(a) * scale;
+  const T b_real_scaled = b_real * scale;
+  const T b_imag_scaled = b_imag * scale;
+  
+  const T b_norm2_scaled = b_real_scaled * b_real_scaled + b_imag_scaled * b_imag_scaled;
+  return std::complex<T>(
+      (a_real_scaled * b_real_scaled + a_imag_scaled * b_imag_scaled) / b_norm2_scaled,
+      (a_imag_scaled * b_real_scaled - a_real_scaled * b_imag_scaled) / b_norm2_scaled);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
+#if EIGEN_FAST_MATH
+  return complex_divide_fast(a, b);
+#else
+  return complex_divide_stable(a, b);
 #endif
+}
+
+// NOTE: We cannot specialize compound assignment operators with Scalar T,
+//         (i.e.  operator@=(const T&), for @=+,-,*,/)
+//       since they are already specialized for float/double/long double within
+//       the standard <complex> header. We also do not specialize the stream
+//       operators.
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                    \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a) { return a; }                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a) {                                           \
+  return std::complex<T>(-numext::real(a), -numext::imag(a));                                   \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) + b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a + numext::real(b), numext::imag(b));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) - b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a - numext::real(b), -numext::imag(b));                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_multiply(a, b);                                                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a * numext::real(b), a * numext::imag(b));                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_divide(a, b);                                                                  \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const T& a, const std::complex<T>& b) {                               \
+  return complex_divide(std::complex<T>(a, 0), b);                                              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) += numext::real(b);                                                       \
+  numext::imag_ref(a) += numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) -= numext::real(b);                                                       \
+  numext::imag_ref(a) -= numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_multiply(a, b);                                                                   \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_divide(a, b);                                                                     \
+  return  a;                                                                                    \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const T& b) {                                         \
+  return numext::real(a) == b && numext::imag(a) == 0;                                          \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const T& a, const std::complex<T>& b) {                                         \
+  return a  == numext::real(b) && 0 == numext::imag(b);                                         \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const T& b) {                                         \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const T& a, const std::complex<T>& b) {                                         \
+  return !(a == b);                                                                             \
+}
+
+// Do not specialize for long double, since that reduces to double on device.
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
+
+#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
+
+  
+}  // namespace complex_operator_detail
+
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+namespace numext {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+}  // namespace numext
+
+namespace internal {
+EIGEN_USING_STD_COMPLEX_OPERATORS
 
 }  // namespace internal
 }  // namespace Eigen
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 9253d8ca..452019ec 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -19,13 +19,19 @@
 namespace Eigen {
 namespace internal {
 
+template<typename Packet, int N> EIGEN_DEVICE_FUNC inline Packet
+pset(const typename unpacket_traits<Packet>::type (&a)[N] /* a */) {
+  EIGEN_STATIC_ASSERT(unpacket_traits<Packet>::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE);
+  return pload<Packet>(a);
+}
+
 template<typename Packet> EIGEN_STRONG_INLINE Packet
 pfrexp_float(const Packet& a, Packet& exponent) {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   const Packet cst_126f = pset1<Packet>(126.0f);
   const Packet cst_half = pset1<Packet>(0.5f);
   const Packet cst_inv_mant_mask  = pset1frombits<Packet>(~0x7f800000u);
-  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<23>(preinterpret<PacketI>(a))), cst_126f);
+  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<23>(preinterpret<PacketI>(pabs<Packet>(a)))), cst_126f);
   return por(pand(a, cst_inv_mant_mask), cst_half);
 }
 
@@ -34,30 +40,99 @@
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   const Packet cst_1022d = pset1<Packet>(1022.0);
   const Packet cst_half = pset1<Packet>(0.5);
-  const Packet cst_inv_mant_mask  = pset1frombits<Packet>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<52>(preinterpret<PacketI>(a))), cst_1022d);
+  const Packet cst_inv_mant_mask  = pset1frombits<Packet, uint64_t>(static_cast<uint64_t>(~0x7ff0000000000000ull));
+  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<52>(preinterpret<PacketI>(pabs<Packet>(a)))), cst_1022d);
   return por(pand(a, cst_inv_mant_mask), cst_half);
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(Packet a, Packet exponent)
-{
+// Safely applies ldexp, correctly handles overflows, underflows and denormals.
+// Assumes IEEE floating point format.
+template<typename Packet>
+struct pldexp_impl {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_127 = pset1<Packet>(127.f);
-  // return a * 2^exponent
-  PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_127));
-  return pmul(a, preinterpret<Packet>(plogical_shift_left<23>(ei)));
-}
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = std::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+   
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  Packet run(const Packet& a, const Packet& exponent) {
+    // We want to return a * 2^exponent, allowing for all possible integer
+    // exponents without overflowing or underflowing in intermediate
+    // computations.
+    //
+    // Since 'a' and the output can be denormal, the maximum range of 'exponent'
+    // to consider for a float is:
+    //   -255-23 -> 255+23
+    // Below -278 any finite float 'a' will become zero, and above +278 any
+    // finite float will become inf, including when 'a' is the smallest possible 
+    // denormal.
+    //
+    // Unfortunately, 2^(278) cannot be represented using either one or two
+    // finite normal floats, so we must split the scale factor into at least
+    // three parts. It turns out to be faster to split 'exponent' into four
+    // factors, since [exponent>>2] is much faster to compute that [exponent/3].
+    //
+    // Set e = min(max(exponent, -278), 278);
+    //     b = floor(e/4);
+    //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
+    //
+    // This will avoid any intermediate overflows and correctly handle 0, inf,
+    // NaN cases.
+    const Packet max_exponent = pset1<Packet>(Scalar( (ScalarI(1)<<int(ExponentBits)) + ScalarI(MantissaBits) - ScalarI(1)));  // 278
+    const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1));  // 127
+    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+    PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
+    Packet c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^b
+    Packet out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+    b = psub(psub(psub(e, b), b), b); // e - 3b
+    c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^(e-3*b)
+    out = pmul(out, c);
+    return out;
+  }
+};
+
+// Explicitly multiplies 
+//    a * (2^e)
+// clamping e to the range
+// [std::numeric_limits<Scalar>::min_exponent-2, std::numeric_limits<Scalar>::max_exponent]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template<typename Packet>
+struct pldexp_fast_impl {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = std::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+  
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  Packet run(const Packet& a, const Packet& exponent) {
+    const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)));  // 127
+    const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) - ScalarI(1)));     // 255
+    // restrict biased exponent between 0 and 255 for float.
+    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
+    // return a * (2^e)
+    return pmul(a, preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(e)));
+  }
+};
 
 template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_double(Packet a, Packet exponent)
-{
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_1023 = pset1<Packet>(1023.0);
-  // return a * 2^exponent
-  PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_1023));
-  return pmul(a, preinterpret<Packet>(plogical_shift_left<52>(ei)));
-}
+pldexp_float(const Packet& a, const Packet& exponent)
+{ return pldexp_impl<Packet>::run(a, exponent); }
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_double(const Packet& a, const Packet& exponent)
+{ return pldexp_impl<Packet>::run(a, exponent); }
 
 // Natural or base 2 logarithm.
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
@@ -342,8 +417,8 @@
 {
   const Packet cst_1      = pset1<Packet>(1.0f);
   const Packet cst_half   = pset1<Packet>(0.5f);
-  const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
-  const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);
+  const Packet cst_exp_hi = pset1<Packet>( 88.723f);
+  const Packet cst_exp_lo = pset1<Packet>(-88.723f);
 
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
   const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
@@ -388,6 +463,7 @@
   y  = pmadd(y, r2, y2);
   
   // Return 2^m * exp(r).
+  // TODO: replace pldexp with faster implementation since y in [-1, 1).
   return pmax(pldexp(y,m), _x);
 }
 
@@ -402,8 +478,8 @@
   const Packet cst_2 = pset1<Packet>(2.0);
   const Packet cst_half = pset1<Packet>(0.5);
 
-  const Packet cst_exp_hi = pset1<Packet>(709.437);
-  const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
+  const Packet cst_exp_hi = pset1<Packet>(709.784);
+  const Packet cst_exp_lo = pset1<Packet>(-709.784);
 
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
   const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
@@ -456,6 +532,7 @@
 
   // Construct the result 2^n * exp(g) = e * x. The max is used to catch
   // non-finite values in the input.
+  // TODO: replace pldexp with faster implementation since x in [-1, 1).
   return pmax(pldexp(x,fx), _x);
 }
 
@@ -787,6 +864,181 @@
                   pselect(is_real_inf, real_inf_result,result));
 }
 
+
+// This function implements the Veltkamp splitting. Given a floating point
+// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
+// exactly and that half of the significant of x fits in x_hi.
+// This code corresponds to Algorithms 3 and 4 in
+// https://hal.inria.fr/hal-01774587v2/document
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
+  Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+  x_hi = pmadd(pset1<Packet>(-shift_scale), x, gamma);
+#else
+  Packet rho = psub(x, gamma);
+  x_hi = padd(rho, gamma);
+#endif
+  x_lo = psub(x, x_hi);
+}
+
+// This function splits x into the nearest integer n and fractional part r,
+// such that x = n + r holds exactly.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void integer_split(const Packet& x, Packet& n, Packet& r) {
+  n = pround(x);
+  r = psub(x, n);
+}
+
+// This function implements Dekker's algorithm for two products {x * y1, x * y2} with
+// a shared factor. Given floating point numbers {x, y1, y2} computes the pairs
+// {p1, r1} and {p2, r2} such that x * y1 = p1 + r1 holds exactly and
+// p1 = fl(x * y1), and x * y2 = p2 + r2 holds exactly and p2 = fl(x * y2).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void double_dekker(const Packet& x, const Packet& y1, const Packet& y2,
+                   Packet& p1, Packet& r1, Packet& p2, Packet& r2) {
+  Packet x_hi, x_lo, y1_hi, y1_lo, y2_hi, y2_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y1, y1_hi, y1_lo);
+  veltkamp_splitting(y2, y2_hi, y2_lo);
+
+  p1 = pmul(x, y1);
+  r1 = pmadd(x_hi, y1_hi, pnegate(p1));
+  r1 = pmadd(x_hi, y1_lo, r1);
+  r1 = pmadd(x_lo, y1_hi, r1);
+  r1 = pmadd(x_lo, y1_lo, r1);
+
+  p2 = pmul(x, y2);
+  r2 = pmadd(x_hi, y2_hi, pnegate(p2));
+  r2 = pmadd(x_hi, y2_lo, r2);
+  r2 = pmadd(x_lo, y2_hi, r2);
+  r2 = pmadd(x_lo, y2_lo, r2);
+}
+
+// This function implements the non-trivial case of pow(x,y) where x is
+// positive and y is (possibly) non-integer.
+// Formally, pow(x,y) = 2**(y * log2(x))
+template<typename Packet>
+EIGEN_STRONG_INLINE
+Packet generic_pow_impl(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Split x into exponent e_x and mantissa m_x.
+  Packet e_x;
+  Packet m_x = pfrexp(x, e_x);
+
+  // Adjust m_x to lie in [0.75:1.5) to minimize absolute error in log2(m_x).
+  Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(Scalar(0.75)));
+  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
+  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
+
+  Packet r_x = plog2(m_x);
+
+  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
+  // precision using Dekker's algorithm.
+  Packet f1_hi, f1_lo, f2_hi, f2_lo;
+  double_dekker(y, e_x, r_x, f1_hi, f1_lo, f2_hi, f2_lo);
+
+  // Separate f into integer and fractional parts, keeping f1_hi, and f2_hi
+  // separate to avoid cancellation.
+  Packet n1, r1, n2, r2;
+  integer_split(f1_hi, n1, r1);
+  integer_split(f2_hi, n2, r2);
+
+  // Add up integer parts and sum the remainders.
+  Packet n_z = padd(n1, n2);
+  // Notice: I experimented with using compensated (Kahan) summation here,
+  // but it does not seem to matter.
+  Packet rem = padd(padd(f1_lo, f2_lo), padd(r1, r2));
+
+  // Extract any additional integer part that may have accumulated in rem.
+  Packet nrem, r_z;
+  integer_split(rem, nrem, r_z);
+  n_z = padd(n_z, nrem);
+
+  // We now have an accurate split of f = n_z + r_z and can compute
+  // x^y = 2**{n_z + r_z) = exp(ln(2) * r_z) * 2**{n_z}.
+  // The first factor we compute by calling pexp(), while multiplication
+  // by an integer power of 2 can be done exactly using pldexp().
+  // Note: I experimented with using Dekker's algorithms for the
+  // multiplication by ln(2) here, but did not see any difference.
+  Packet e_r = pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), r_z));
+  // TODO: investigate bounds of e_r and n_z, potentially using faster
+  //       implementation of ldexp.
+  return pldexp(e_r, n_z);
+}
+
+// Generic implementation of pow(x,y).
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet generic_pow(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_zero = pset1<Packet>(Scalar(0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_half = pset1<Packet>(Scalar(0.5));
+  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
+
+  Packet abs_x = pabs(x);
+  // Predicates for sign and magnitude of x.
+  Packet x_is_zero = pcmp_eq(x, cst_zero);
+  Packet x_is_neg = pcmp_lt(x, cst_zero);
+  Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
+  Packet abs_x_is_one =  pcmp_eq(abs_x, cst_one);
+  Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
+  Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
+  Packet x_is_one =  pandnot(abs_x_is_one, x_is_neg);
+  Packet x_is_neg_one =  pand(abs_x_is_one, x_is_neg);
+  Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
+
+  // Predicates for sign and magnitude of y.
+  Packet y_is_zero = pcmp_eq(y, cst_zero);
+  Packet y_is_neg = pcmp_lt(y, cst_zero);
+  Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
+  Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
+  Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
+  // |y| is so large that (1+eps)^y over- or underflows.
+  EIGEN_CONSTEXPR Scalar huge_exponent =
+      (std::numeric_limits<Scalar>::digits * Scalar(EIGEN_LOG2E)) /
+      std::numeric_limits<Scalar>::epsilon();
+  Packet abs_y_is_huge = pcmp_lt(pset1<Packet>(huge_exponent), pabs(y));
+
+  // Predicates for whether y is integer and/or even.
+  Packet y_is_int = pcmp_eq(pfloor(y), y);
+  Packet y_div_2 = pmul(y, cst_half);
+  Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+
+  // Predicates encoding special cases for the value of pow(x,y)
+  Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
+  Packet pow_is_one = por(por(x_is_one, y_is_zero),
+                          pand(x_is_neg_one,
+                               por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
+  Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
+  Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
+                               pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)),
+                           pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg));
+  Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
+                              pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
+                          pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
+
+  // General computation of pow(x,y) for positive x or negative x and integer y.
+  Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
+  Packet pow_abs = generic_pow_impl(abs_x, y);
+
+  return pselect(pow_is_one, cst_one,
+                 pselect(pow_is_nan, cst_nan,
+                         pselect(pow_is_inf, cst_pos_inf,
+                                 pselect(pow_is_zero, cst_zero,
+                                         pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))));
+}
+
+
 /* polevl (modified for Eigen)
  *
  *      Evaluate polynomial
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 491f1c9..96c572f 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -17,14 +17,37 @@
 // implemented in GenericPacketMathFunctions.h
 // This is needed to workaround a circular dependency.
 
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */
+template<typename Packet, int N> EIGEN_DEVICE_FUNC inline Packet
+pset(const typename unpacket_traits<Packet>::type (&a)[N] /* a */);
+
+/***************************************************************************
+ * Some generic implementations to be used by implementors
+***************************************************************************/
+
+/** Default implementation of pfrexp for float.
+  * It is expected to be called by implementers of template<> pfrexp.
+  */
 template<typename Packet> EIGEN_STRONG_INLINE Packet
 pfrexp_float(const Packet& a, Packet& exponent);
 
+/** Default implementation of pfrexp for double.
+  * It is expected to be called by implementers of template<> pfrexp.
+  */
 template<typename Packet> EIGEN_STRONG_INLINE Packet
 pfrexp_double(const Packet& a, Packet& exponent);
 
+/** Default implementation of pldexp for float.
+  * It is expected to be called by implementers of template<> pldexp.
+  */
 template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(Packet a, Packet exponent);
+pldexp_float(const Packet& a, const Packet& exponent);
+
+/** Default implementation of pldexp for double.
+  * It is expected to be called by implementers of template<> pldexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_double(const Packet& a, const Packet& exponent);
 
 /** \internal \returns log(x) for single precision float */
 template <typename Packet>
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index a889ab1..1aa361b 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -124,6 +124,13 @@
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b);
+template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f};
+  return Packet2cf(padd(a.v, pxor(mask, b.v)));  
+}
+
 template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
 
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
new file mode 100644
index 0000000..3481f33
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -0,0 +1,183 @@
+namespace Eigen {
+namespace internal {
+  
+#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
+// Here we specialize gebp_traits to eliminate these register spills.
+// See #2138.
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  { 
+    // This volatile inline ASM both acts as a barrier to prevent reordering,
+    // as well as enforces strict register use.
+    asm volatile(
+      "vmla.f32 %q[r], %q[c], %q[alpha]"
+      : [r] "+w" (r)
+      : [c] "w" (c),
+        [alpha] "w" (alpha)
+      : );
+  }
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
+                                Packet4f& c, Packet4f& tmp,
+                                const LaneIdType&) const {
+    acc(a, b, c);
+  }
+  
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
+                                Packet4f& c, Packet4f& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+};
+
+#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+#if EIGEN_ARCH_ARM64
+
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+  typedef float RhsPacket;
+  typedef float32x4_t RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest = vld1q_f32(b);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f32(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  { madd_helper<0>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+  { madd_helper<1>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+  { madd_helper<2>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+  { madd_helper<3>(a, b, c); }
+
+ private:
+  template<int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f32 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    #else
+    c = vfmaq_laneq_f32(c, a, b, LaneID);
+    #endif
+  }
+};
+
+
+template<>
+struct gebp_traits <double,double,false,false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+  typedef double RhsPacket;
+
+  struct RhsPacketx4 {
+    float64x2_t B_0, B_1;
+  };
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest.B_0 = vld1q_f64(b);
+    dest.B_1 = vld1q_f64(b+2);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f64(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  { madd_helper<0>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+  { madd_helper<1>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+  { madd_helper<2>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+  { madd_helper<3>(a, b, c); }
+
+ private:
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f64 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    #else
+         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+    #endif
+  }
+};
+
+#endif // EIGEN_ARCH_ARM64
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 28167b9..fa6615a 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -44,6 +44,18 @@
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
 
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) {
+  Packet4f fexponent;
+  const Packet4bf out = F32ToBf16(pfrexp<Packet4f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) {
+  return F32ToBf16(pldexp<Packet4f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
 
 //---------- double ----------
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 14f3dbd..c7d5397 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -84,12 +84,18 @@
 
 #endif // EIGEN_COMP_MSVC
 
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  return res;
+}
+
 // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
 // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
 // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
-// to enable a shared implementation for fast inversion of matrices of size 4.
-template<bool interleave>
-EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask)
+// to enable a shared implementation for fast inversion of matrices of size 4. 
+template<bool interleave> 
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
 {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
@@ -97,8 +103,8 @@
   return res;
 }
 
-template<>
-EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask)
+template<> 
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
 {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
@@ -108,25 +114,29 @@
 
 EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
 
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
+{ 
+  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
+}
 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
-{
-  return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+{ 
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+  return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
 }
 #define vec4f_duplane(a, p) \
   vdupq_lane_f32(vget_low_f32(a), p)
@@ -851,6 +861,17 @@
 template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+  Packet2f mask = {-0.0f, 0.0f};
+  return padd(a, pxor(mask, b));
+}
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f};
+  return padd(a, pxor(mask, b));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
@@ -3273,26 +3294,23 @@
 // effective latency. This is similar to Quake3's fast inverse square root.
 // For more details see: http://www.beyond3d.com/content/articles/8/
 template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){
-  Packet4f minus_half_x = vmulq_n_f32(_x, -0.5f);
   Packet4ui denormal_mask = vandq_u32(vcgeq_f32(_x, vdupq_n_f32(0.0f)),
                                       vcltq_f32(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
   // Compute approximate reciprocal sqrt.
   Packet4f x = vrsqrteq_f32(_x);
-  // Do a single step of Newton's iteration.
-  //the number 1.5f was set reference to Quake3's fast inverse square root
-  x = pmul(x, pmadd(minus_half_x, pmul(x, x), pset1<Packet4f>(1.5f)));
+  // Do one Newton's iteration for 1/sqrt(x).
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_x, x), x), x);
   // Flush results for denormals to zero.
   return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(pmul(_x, x)), denormal_mask));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x) {
-  Packet2f minus_half_x = vmul_n_f32(_x, -0.5f);
   Packet2ui denormal_mask = vand_u32(vcge_f32(_x, vdup_n_f32(0.0f)),
                                      vclt_f32(_x, pset1<Packet2f>((std::numeric_limits<float>::min)())));
   // Compute approximate reciprocal sqrt.
   Packet2f x = vrsqrte_f32(_x);
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(minus_half_x, pmul(x, x), pset1<Packet2f>(1.5f)));
+  // Do one Newton's iteration for 1/sqrt(x).
+  x = vmul_f32(vrsqrts_f32(vmul_f32(_x, x), x), x);
   // Flush results for denormals to zero.
   return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(pmul(_x, x)), denormal_mask));
 }
@@ -3717,6 +3735,12 @@
 
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+  const Packet2d mask = {-0.0,0.0};
+  return padd(a, pxor(mask, b));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
@@ -4484,31 +4508,16 @@
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
 {
-  EIGEN_ALIGN16 Eigen::half in[4][8];
+  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
 
-  pstore<Eigen::half>(in[0], kernel.packet[0]);
-  pstore<Eigen::half>(in[1], kernel.packet[1]);
-  pstore<Eigen::half>(in[2], kernel.packet[2]);
-  pstore<Eigen::half>(in[3], kernel.packet[3]);
+  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
 
-  EIGEN_ALIGN16 Eigen::half out[4][8];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 4; ++i) {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][2 * i];
-    }
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j < 4; ++j) {
-      out[i][j + 4] = in[j][2 * i + 1];
-    }
-  }
-
-  kernel.packet[0] = pload<Packet8hf>(out[0]);
-  kernel.packet[1] = pload<Packet8hf>(out[1]);
-  kernel.packet[2] = pload<Packet8hf>(out[2]);
-  kernel.packet[3] = pload<Packet8hf>(out[3]);
+  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 58cdb5d..f6f1b8c 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -66,6 +66,13 @@
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b);
+template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0));
+  return Packet2cf(padd(a.v, pxor(mask, b.v)));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
 {
   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 9f66d8a..8736d0d 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -150,7 +150,7 @@
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation.
+  // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
   return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
 }
 
@@ -158,7 +158,6 @@
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
   return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
 }
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4e733c7..422b0fc 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -26,7 +26,7 @@
 
 #ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 #endif
 
@@ -147,13 +147,13 @@
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasBlend = 1,
+    HasCeil = 1,
     HasFloor = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
     HasRint = 1,
-    HasRound = 1,
-    HasCeil = 1
+    HasRound = 1
 #endif
   };
 };
@@ -173,14 +173,14 @@
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1
+    HasBlend = 1,
+    HasFloor = 1,
+    HasCeil = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
     HasRound = 1,
-    HasRint = 1,
-    HasFloor = 1,
-    HasCeil = 1
+    HasRint = 1
 #endif
   };
 };
@@ -301,6 +301,28 @@
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_ps(a,b);
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+  return padd(a, pxor(mask, b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) 
+{
+#ifdef EIGEN_VECTORIZE_SSE3  
+  return _mm_addsub_pd(a,b); 
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); 
+  return padd(a, pxor(mask, b));
+#endif
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
 {
   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
@@ -377,21 +399,6 @@
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
-
-
 template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
 template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
 template<> EIGEN_STRONG_INLINE Packet4f
@@ -425,6 +432,21 @@
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
 
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
   // There appears to be a bug in GCC, by which the optimizer may
@@ -628,6 +650,30 @@
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
 }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4i emm0 = _mm_cvttps_epi32(a);
+  Packet4f tmp  = _mm_cvtepi32_ps(emm0);
+  /* if greater, substract 1 */
+  Packet4f mask = _mm_cmplt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+// WARNING: this pfloor implementation makes sense for small inputs only,
+// It is currently only used by pexp and not exposed through HasFloor.
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet4i emm0 = _mm_cvttpd_epi32(a);
+  Packet2d tmp  = _mm_cvtepi32_pd(emm0);
+  /* if greater, substract 1 */
+  Packet2d mask = _mm_cmplt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
@@ -861,13 +907,22 @@
 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
 // supported by SSE, and has more range than is needed for exponents.
 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
-  const Packet2d cst_1023 = pset1<Packet2d>(1023.0);
-  // Add exponent offset.
-  Packet4i ei = _mm_cvtpd_epi32(padd(exponent, cst_1023));
-  // Convert to exponents to int64 and swizzle to the low-order 32 bits.
-  ei = vec4i_swizzle1(ei, 0, 3, 1, 3);
-  // return a * 2^exponent
-  return pmul(a, _mm_castsi128_pd(_mm_slli_epi64(ei, 52)));
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
+  
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+  
+  // Split 2^e into four factors and multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  Packet4i b = parithmetic_shift_right<2>(ei);  // floor(e/4)
+  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(ei, b), b), b);  // e - 3b
+  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^(e - 3b)
+  out = pmul(out, c);  // a * 2^e
+  return out;
 }
 
 // with AVX, the default implementations based on pload1 are faster
diff --git a/Eigen/src/Core/arch/SVE/MathFunctions.h b/Eigen/src/Core/arch/SVE/MathFunctions.h
new file mode 100644
index 0000000..b139ea2
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_SVE_H
+#define EIGEN_MATH_FUNCTIONS_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp<PacketXf>(const PacketXf& x) {
+  return pexp_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog<PacketXf>(const PacketXf& x) {
+  return plog_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin<PacketXf>(const PacketXf& x) {
+  return psin_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos<PacketXf>(const PacketXf& x) {
+  return pcos_float(x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh<PacketXf>(const PacketXf& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SVE_H
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
new file mode 100644
index 0000000..98585e6
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -0,0 +1,756 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SVE_H
+#define EIGEN_PACKET_MATH_SVE_H
+
+namespace Eigen
+{
+namespace internal
+{
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
+template <typename Scalar, int SVEVectorLength>
+struct sve_packet_size_selector {
+  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+/********************************* int32 **************************************/
+typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef PacketXi type;
+  typedef PacketXi half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0  // Not implemented in SVE
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXi> {
+  typedef numext::int32_t type;
+  typedef PacketXi half;  // Half not yet implemented
+  enum {
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
+{
+  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
+{
+  return svdup_n_s32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
+{
+  numext::int32_t c[packet_traits<numext::int32_t>::size];
+  for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
+  return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svadd_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svsub_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
+{
+  return svneg_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
+{
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmul_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdiv_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
+{
+  return svmla_s32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmin_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmax_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
+{
+  return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
+{
+  return svdup_n_s32_z(svptrue_b32(), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svand_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svorr_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return sveor_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svbic_s32_z(svptrue_b32(), a, b);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
+{
+  return svasrd_n_s32_z(svptrue_b32(), a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
+{
+  return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
+{
+  return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
+{
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_s32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
+{
+  return svrev_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
+{
+  return svabs_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
+{
+  return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
+{
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+
+  // Multiply the vector by its reverse
+  svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
+  svint32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_s32(prod, svindex_u32(32, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_s32(prod, svindex_u32(16, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_s32(prod, svindex_u32(8, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_s32(prod, svindex_u32(4, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
+  prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXi>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
+{
+  return svminv_s32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
+{
+  return svmaxv_s32(svptrue_b32(), a);
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
+  int buffer[packet_traits<numext::int32_t>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
+  }
+}
+
+/********************************* float32 ************************************/
+
+typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef PacketXf type;
+  typedef PacketXf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,  // Not implemented in SVE
+
+    HasDiv = 1,
+    HasFloor = 1,
+
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXf> {
+  typedef float type;
+  typedef PacketXf half;  // Half not yet implemented
+  typedef PacketXi integer_packet;
+
+  enum {
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
+{
+  return svdup_n_f32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
+{
+  float c[packet_traits<float>::size];
+  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
+  return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svadd_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svsub_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
+{
+  return svneg_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
+{
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmul_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svdiv_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
+{
+  return svmla_f32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmin_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return pmin<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svminnm_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmax_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return pmax<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmaxnm_f32_z(svptrue_b32(), a, b);
+}
+
+// Float comparisons in SVE return svbool (predicate). Use svdup to set active
+// lanes to 1 (0xffffffffu) and inactive lanes to 0.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+// Do a predicate inverse (svnot_b_z) on the predicate resulted from the
+// greater/equal comparison (svcmpge_f32). Then fill a float vector with the
+// active elements.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
+{
+  return svrintm_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
+{
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_f32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
+{
+  return svrev_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
+{
+  return svabs_f32_z(svptrue_b32(), a);
+}
+
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for 
+// all vector extensions and the generic version.
+template <>
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
+{
+  return pfrexp_float(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
+{
+  return svaddv_f32(svptrue_b32(), a);
+}
+
+// Other reduction functions:
+// mul
+// Only works for SVE Vls multiple of 128
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
+{
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+  // Multiply the vector by its reverse
+  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
+  svfloat32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
+  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXf>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
+{
+  return svminv_f32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
+{
+  return svmaxv_f32(svptrue_b32(), a);
+}
+
+template<int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
+{
+  float buffer[packet_traits<float>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
+  }
+}
+
+template<>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
+{
+  return pldexp_float(a, exponent);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SVE_H
diff --git a/Eigen/src/Core/arch/SVE/TypeCasting.h b/Eigen/src/Core/arch/SVE/TypeCasting.h
new file mode 100644
index 0000000..7ba5d9c
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SVE_H
+#define EIGEN_TYPE_CASTING_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+  return svcvt_f32_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+  return svcvt_s32_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+  return svreinterpret_f32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+  return svreinterpret_s32_f32(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SVE_H
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index eb378a1..b10c1f6 100755
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -10,8 +10,6 @@
 #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
 #define EIGEN_PACKET_MATH_ZVECTOR_H
 
-#include <stdint.h>
-
 namespace Eigen {
 
 namespace internal {
@@ -51,10 +49,10 @@
 #endif
 
 typedef union {
-  int32_t   i[4];
-  uint32_t ui[4];
-  int64_t   l[2];
-  uint64_t ul[2];
+  numext::int32_t   i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t   l[2];
+  numext::uint64_t ul[2];
   double    d[2];
   float     f[4];
   Packet4i  v4i;
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index f3509c4..2716081 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -403,6 +403,7 @@
 
 /** \internal
   * \brief Template functor to compute the pow of two scalars
+  * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow
   */
 template<typename Scalar, typename Exponent>
 struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
@@ -417,16 +418,31 @@
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
+
   EIGEN_DEVICE_FUNC
   inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return generic_pow(a,b);
+  }
 };
+
 template<typename Scalar, typename Exponent>
 struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger &&
+                    packet_traits<Scalar>::HasExp && packet_traits<Scalar>::HasLog &&
+                    packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp &&
+                    // Temporarly disable packet access for half/bfloat16 until
+                    // accuracy is improved.
+                    !is_same<Scalar, half>::value && !is_same<Scalar, bfloat16>::value
+                    )
+  };
 };
 
-
-
 //---------- non associative binary functors ----------
 
 /** \internal
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index eee6ae1..c98fa57 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -403,7 +403,7 @@
   */
 template<typename Scalar> struct scalar_log2_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * std::log(a); }
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); }
 };
@@ -456,7 +456,7 @@
   */
 template<typename Scalar> struct scalar_rsqrt_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); }
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
 };
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 216d1ad..79367f1 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1076,148 +1076,6 @@
 
 };
 
-
-#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
-
-template<>
-struct gebp_traits <float, float, false, false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
-  typedef float RhsPacket;
-
-  typedef float32x4_t RhsPacketx4;
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest = vld1q_f32(b);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
-    c = vfmaq_n_f32(c, a, b);
-  }
-
-  // NOTE: Template parameter inference failed when compiled with Android NDK:
-  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
-
- private:
-  template<int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
-    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
-    // vfmaq_laneq_f32 is implemented through a costly dup
-         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    #else
-    c = vfmaq_laneq_f32(c, a, b, LaneID);
-    #endif
-  }
-};
-
-
-template<>
-struct gebp_traits <double, double, false, false,Architecture::NEON>
- : gebp_traits<double,double,false,false,Architecture::Generic>
-{
-  typedef double RhsPacket;
-
-  struct RhsPacketx4 {
-    float64x2_t B_0, B_1;
-  };
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest.B_0 = vld1q_f64(b);
-    dest.B_1 = vld1q_f64(b+2);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
-    c = vfmaq_n_f64(c, a, b);
-  }
-
-  // NOTE: Template parameter inference failed when compiled with Android NDK:
-  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
-
- private:
-  template <int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
-    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
-    // vfmaq_laneq_f64 is implemented through a costly dup
-         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    #else
-         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
-    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
-    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
-    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
-    #endif
-  }
-};
-
-#endif
-
 /* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index f07a284..af4e696 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2020, Arm Limited and Contributors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -384,34 +385,50 @@
     #undef vector
     #undef pixel
 
-  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
+  #elif ((defined  __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
 
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_NEON
     #include <arm_neon.h>
 
-  #elif (defined __s390x__ && defined __VEC__)
+  // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
+  // will not select the backend automatically
+  #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
 
     #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ZVECTOR
-    #include <vecintrin.h>
+    #define EIGEN_VECTORIZE_SVE
+    #include <arm_sve.h>
 
-  #elif defined __mips_msa
+    // Since we depend on knowing SVE vector lengths at compile-time, we need
+    // to ensure a fixed lengths is set
+    #if defined __ARM_FEATURE_SVE_BITS
+      #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
+    #else
+#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
+#endif
 
-    // Limit MSA optimizations to little-endian CPUs for now.
-    // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
-    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-      #if defined(__LP64__)
-        #define EIGEN_MIPS_64
-      #else
-        #define EIGEN_MIPS_32
-      #endif
-      #define EIGEN_VECTORIZE
-      #define EIGEN_VECTORIZE_MSA
-      #include <msa.h>
-    #endif
+#elif (defined __s390x__ && defined __VEC__)
 
-  #endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ZVECTOR
+#include <vecintrin.h>
+
+#elif defined __mips_msa
+
+// Limit MSA optimizations to little-endian CPUs for now.
+// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__LP64__)
+#define EIGEN_MIPS_64
+#else
+#define EIGEN_MIPS_32
+#endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_MSA
+#include <msa.h>
+#endif
+
+#endif
 #endif
 
 // Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
@@ -478,6 +495,8 @@
   return "VSX";
 #elif defined(EIGEN_VECTORIZE_NEON)
   return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_SVE)
+  return "ARM SVE";
 #elif defined(EIGEN_VECTORIZE_ZVECTOR)
   return "S390X ZVECTOR";
 #elif defined(EIGEN_VECTORIZE_MSA)
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index ad9af57..f7f907a 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2020, Arm Limited and Contributors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -474,6 +475,7 @@
     VSX = 0x3,
     NEON = 0x4,
     MSA = 0x5,
+    SVE = 0x6,
 #if defined EIGEN_VECTORIZE_SSE
     Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -482,6 +484,8 @@
     Target = VSX
 #elif defined EIGEN_VECTORIZE_NEON
     Target = NEON
+#elif defined EIGEN_VECTORIZE_SVE
+    Target = SVE
 #elif defined EIGEN_VECTORIZE_MSA
     Target = MSA
 #else
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 258c391..14713aaa 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -1048,20 +1048,32 @@
 {
   if(max_std_funcs>=4)
     queryCacheSizes_intel_direct(l1,l2,l3);
-  else
+  else if(max_std_funcs>=2)
     queryCacheSizes_intel_codes(l1,l2,l3);
+  else
+    l1 = l2 = l3 = 0;
 }
 
 inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
 {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000005,0);
-  l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
-  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000006,0);
-  l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
-  l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  
+  // First query the max supported function.
+  EIGEN_CPUID(abcd,0x80000000,0);
+  if(static_cast<numext::uint32_t>(abcd[0]) >= static_cast<numext::uint32_t>(0x80000006))
+  {
+    EIGEN_CPUID(abcd,0x80000005,0);
+    l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd,0x80000006,0);
+    l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
+    l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  }
+  else
+  {
+    l1 = l2 = l3 = 0;
+  }
 }
 #endif
 
@@ -1077,7 +1089,7 @@
 
   // identify the CPU vendor
   EIGEN_CPUID(abcd,0x0,0);
-  int max_std_funcs = abcd[1];
+  int max_std_funcs = abcd[0];
   if(cpuid_is_vendor(abcd,GenuineIntel))
     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
   else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 64938d9..ab1bc01 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -25,8 +25,40 @@
 
 #endif
 
-#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+// Recent versions of ICC require <cstdint> for pointer types below.
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11)
+
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT
 #include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint8_t  uint8_t;
+typedef std::int8_t   int8_t;
+typedef std::uint16_t uint16_t;
+typedef std::int16_t  int16_t;
+typedef std::uint32_t uint32_t;
+typedef std::int32_t  int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t  int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint8_t  uint8_t;
+typedef ::int8_t   int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t  int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t  int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t  int64_t;
+}
+}
 #endif
 
 namespace Eigen {
@@ -52,13 +84,14 @@
 
 // Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
 // and older versions do not provide *intptr_t types.
-#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+#if EIGEN_ICC_NEEDS_CSTDINT
 typedef std::intptr_t  IntPtr;
 typedef std::uintptr_t UIntPtr;
 #else
 typedef std::ptrdiff_t IntPtr;
 typedef std::size_t UIntPtr;
 #endif
+#undef EIGEN_ICC_NEEDS_CSTDINT
 
 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
@@ -688,37 +721,4 @@
 
 } // end namespace Eigen
 
-// Define portable (u)int{32,64} types
-#if EIGEN_HAS_CXX11
-#include <cstdint>
-namespace Eigen {
-namespace numext {
-typedef std::uint8_t  uint8_t;
-typedef std::int8_t   int8_t;
-typedef std::uint16_t uint16_t;
-typedef std::int16_t  int16_t;
-typedef std::uint32_t uint32_t;
-typedef std::int32_t  int32_t;
-typedef std::uint64_t uint64_t;
-typedef std::int64_t  int64_t;
-}
-}
-#else
-// Without c++11, all compilers able to compile Eigen also
-// provides the C99 stdint.h header file.
-#include <stdint.h>
-namespace Eigen {
-namespace numext {
-typedef ::uint8_t  uint8_t;
-typedef ::int8_t   int8_t;
-typedef ::uint16_t uint16_t;
-typedef ::int16_t  int16_t;
-typedef ::uint32_t uint32_t;
-typedef ::int32_t  int32_t;
-typedef ::uint64_t uint64_t;
-typedef ::int64_t  int64_t;
-}
-}
-#endif
-
 #endif // EIGEN_META_H
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 95107ff..2ef19a4 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -105,7 +105,8 @@
         CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
         SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
         INVALID_TEMPLATE_PARAMETER=1,
-        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1,
+        THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1
       };
     };
 
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h
similarity index 79%
rename from Eigen/src/Geometry/arch/Geometry_SSE.h
rename to Eigen/src/Geometry/arch/Geometry_SIMD.h
index 108cc9f..9c15bfb 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h
@@ -8,15 +8,15 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_GEOMETRY_SSE_H
-#define EIGEN_GEOMETRY_SSE_H
+#ifndef EIGEN_GEOMETRY_SIMD_H
+#define EIGEN_GEOMETRY_SIMD_H
 
 namespace Eigen { 
 
 namespace internal {
 
 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
+struct quat_product<Architecture::Target, Derived, OtherDerived, float>
 {
   enum {
     AAlignment = traits<Derived>::Alignment,
@@ -28,7 +28,8 @@
     evaluator<typename Derived::Coefficients> ae(_a.coeffs());
     evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
     Quaternion<float> res;
-    const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    float arr[4] = {0.f, 0.f, 0.f, -0.f};
+    const Packet4f mask = pset<Packet4f>(arr);
     Packet4f a = ae.template packet<AAlignment,Packet4f>(0);
     Packet4f b = be.template packet<BAlignment,Packet4f>(0);
     Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
@@ -45,7 +46,7 @@
 };
 
 template<class Derived>
-struct quat_conj<Architecture::SSE, Derived, float>
+struct quat_conj<Architecture::Target, Derived, float>
 {
   enum {
     ResAlignment = traits<Quaternion<float> >::Alignment
@@ -54,7 +55,8 @@
   {
     evaluator<typename Derived::Coefficients> qe(q.coeffs());
     Quaternion<float> res;
-    const Packet4f mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
+    float arr[4] = {-0.f,-0.f,-0.f,0.f};
+    const Packet4f mask = pset<Packet4f>(arr);
     pstoret<float,Packet4f,ResAlignment>(&res.x(), pxor(mask, qe.template packet<traits<Derived>::Alignment,Packet4f>(0)));
     return res;
   }
@@ -62,7 +64,7 @@
 
 
 template<typename VectorLhs,typename VectorRhs>
-struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
+struct cross3_impl<Architecture::Target,VectorLhs,VectorRhs,float,true>
 {
   enum {
     ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
@@ -84,9 +86,10 @@
 
 
 
+#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64)
 
 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double>
+struct quat_product<Architecture::Target, Derived, OtherDerived, double>
 {
   enum {
     BAlignment = traits<OtherDerived>::Alignment,
@@ -95,8 +98,6 @@
 
   static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
-  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-
   Quaternion<double> res;
 
   evaluator<typename Derived::Coefficients> ae(_a.coeffs());
@@ -120,12 +121,7 @@
    */
   t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
   t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
-#else
-  pstoret<double,Packet2d,ResAlignment>(&res.x(), padd(t1, pxor(mask,preverse(t2))));
-#endif
+  pstoret<double,Packet2d,ResAlignment>(&res.x(), paddsub(t1, preverse(t2)));
   
   /*
    * t1 = ww*zw - yy*xy
@@ -134,19 +130,14 @@
    */
   t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
   t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
-#else
-  pstoret<double,Packet2d,ResAlignment>(&res.z(), psub(t1, pxor(mask,preverse(t2))));
-#endif
+  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(paddsub(preverse(t1), t2)));
 
   return res;
 }
 };
 
 template<class Derived>
-struct quat_conj<Architecture::SSE, Derived, double>
+struct quat_conj<Architecture::Target, Derived, double>
 {
   enum {
     ResAlignment = traits<Quaternion<double> >::Alignment
@@ -155,16 +146,20 @@
   {
     evaluator<typename Derived::Coefficients> qe(q.coeffs());
     Quaternion<double> res;
-    const Packet2d mask0 = _mm_setr_pd(-0.,-0.);
-    const Packet2d mask2 = _mm_setr_pd(-0.,0.);
+    double arr1[2] = {-0.0, -0.0};
+    double arr2[2] = {-0.0,  0.0};
+    const Packet2d mask0 = pset<Packet2d>(arr1);
+    const Packet2d mask2 = pset<Packet2d>(arr2);
     pstoret<double,Packet2d,ResAlignment>(&res.x(), pxor(mask0, qe.template packet<traits<Derived>::Alignment,Packet2d>(0)));
     pstoret<double,Packet2d,ResAlignment>(&res.z(), pxor(mask2, qe.template packet<traits<Derived>::Alignment,Packet2d>(2)));
     return res;
   }
 };
 
+#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64
+
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_GEOMETRY_SSE_H
+#endif // EIGEN_GEOMETRY_SIMD_H
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index bcec45f..e0c4456 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -1330,7 +1330,6 @@
 #endif
 }//end deflation
 
-#if !defined(EIGEN_GPUCC)
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
@@ -1343,7 +1342,6 @@
 {
   return BDCSVD<PlainObject>(*this, computationOptions);
 }
-#endif
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index 1e7116c..0c8d893 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -18,6 +18,63 @@
 template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
 template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
 
+template <bool Conjugate,class SparseLUType>
+class SparseLUTransposeView : public SparseSolverBase<SparseLUTransposeView<Conjugate,SparseLUType> >
+{
+protected:
+  typedef SparseSolverBase<SparseLUTransposeView<Conjugate,SparseLUType> > APIBase;
+  using APIBase::m_isInitialized;
+public:
+  typedef typename SparseLUType::Scalar Scalar;
+  typedef typename SparseLUType::StorageIndex StorageIndex;
+  typedef typename SparseLUType::MatrixType MatrixType;
+  typedef typename SparseLUType::OrderingType OrderingType;
+
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  SparseLUTransposeView() : m_sparseLU(NULL) {}
+  SparseLUTransposeView(const SparseLUTransposeView& view) {
+    this->m_sparseLU = view.m_sparseLU;
+  }
+  void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;}
+  void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;}
+  using APIBase::_solve_impl;
+  template<typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
+  {
+    Dest& X(X_base.derived());
+    eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first");
+    EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
+                        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+
+
+    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    for(Index j = 0; j < B.cols(); ++j){
+      X.col(j) = m_sparseLU->colsPermutation() * B.const_cast_derived().col(j);
+    }
+    //Forward substitution with transposed or adjoint of U
+    m_sparseLU->matrixU().template solveTransposedInPlace<Conjugate>(X);
+
+    //Backward substitution with transposed or adjoint of L
+    m_sparseLU->matrixL().template solveTransposedInPlace<Conjugate>(X);
+
+    // Permute back the solution
+    for (Index j = 0; j < B.cols(); ++j)
+      X.col(j) = m_sparseLU->rowsPermutation().transpose() * X.col(j);
+    return true;
+  }
+  inline Index rows() const { return m_sparseLU->rows(); }
+  inline Index cols() const { return m_sparseLU->cols(); }
+
+private:
+  SparseLUType *m_sparseLU;
+  SparseLUTransposeView& operator=(const SparseLUTransposeView&);
+};
+
+
 /** \ingroup SparseLU_Module
   * \class SparseLU
   * 
@@ -97,6 +154,7 @@
     };
     
   public:
+
     SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
@@ -128,6 +186,45 @@
       //Factorize
       factorize(matrix);
     } 
+
+    /** \returns an expression of the transposed of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code
+      * solver.compute(A);
+      * x = solver.transpose().solve(b);
+      * \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    const SparseLUTransposeView<false,SparseLU<_MatrixType,_OrderingType> > transpose()
+    {
+      SparseLUTransposeView<false,  SparseLU<_MatrixType,_OrderingType> > transposeView;
+      transposeView.setSparseLU(this);
+      transposeView.setIsInitialized(this->m_isInitialized);
+      return transposeView;
+    }
+
+
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code
+      * solver.compute(A);
+      * x = solver.adjoint().solve(b);
+      * \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    const SparseLUTransposeView<true, SparseLU<_MatrixType,_OrderingType> > adjoint()
+    {
+      SparseLUTransposeView<true,  SparseLU<_MatrixType,_OrderingType> > adjointView;
+      adjointView.setSparseLU(this);
+      adjointView.setIsInitialized(this->m_isInitialized);
+      return adjointView;
+    }
     
     inline Index rows() const { return m_mat.rows(); }
     inline Index cols() const { return m_mat.cols(); }
@@ -394,7 +491,6 @@
   private:
     // Disable copy constructor 
     SparseLU (const SparseLU& );
-  
 }; // End class SparseLU
 
 
@@ -587,7 +683,6 @@
   //  (a) a relaxed supernode at the bottom of the etree, or
   //  (b) panel_size contiguous columns, <panel_size> defined by the user
   Index jcol; 
-  IndexVector panel_histo(n);
   Index pivrow; // Pivotal row number in the original row matrix
   Index nseg1; // Number of segments in U-column above panel row jcol
   Index nseg; // Number of segments in each U-column 
@@ -713,6 +808,12 @@
   {
     m_mapL.solveInPlace(X);
   }
+  template<bool Conjugate, typename Dest>
+  void solveTransposedInPlace( MatrixBase<Dest> &X) const
+  {
+    m_mapL.template solveTransposedInPlace<Conjugate>(X);
+  }
+
   const MappedSupernodalType& m_mapL;
 };
 
@@ -767,6 +868,52 @@
       }
     } // End For U-solve
   }
+
+  template<bool Conjugate, typename Dest>   void solveTransposedInPlace(MatrixBase<Dest> &X) const
+  {
+    using numext::conj;
+    Index nrhs = X.cols();
+    Index n    = X.rows();
+    // Forward solve with U
+    for (Index k = 0; k <=  m_mapL.nsuper(); k++)
+    {
+      Index fsupc = m_mapL.supToCol()[k];
+      Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension
+      Index nsupc = m_mapL.supToCol()[k+1] - fsupc;
+      Index luptr = m_mapL.colIndexPtr()[fsupc];
+
+      for (Index j = 0; j < nrhs; ++j)
+      {
+        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++)
+        {
+          typename MatrixUType::InnerIterator it(m_mapU, jcol);
+          for ( ; it; ++it)
+          {
+            Index irow = it.index();
+            X(jcol, j) -= X(irow, j) * (Conjugate? conj(it.value()): it.value());
+          }
+        }
+      }
+      if (nsupc == 1)
+      {
+        for (Index j = 0; j < nrhs; j++)
+        {
+          X(fsupc, j) /= (Conjugate? conj(m_mapL.valuePtr()[luptr]) : m_mapL.valuePtr()[luptr]);
+        }
+      }
+      else
+      {
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        if(Conjugate)
+          U = A.adjoint().template triangularView<Lower>().solve(U);
+        else
+          U = A.transpose().template triangularView<Lower>().solve(U);
+      }
+    }// End For U-solve
+  }
+
+
   const MatrixLType& m_mapL;
   const MatrixUType& m_mapU;
 };
diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 8583b1b..0be293d 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -156,6 +156,9 @@
     class InnerIterator; 
     template<typename Dest>
     void solveInPlace( MatrixBase<Dest>&X) const;
+    template<bool Conjugate, typename Dest>
+    void solveTransposedInPlace( MatrixBase<Dest>&X) const;
+
     
       
       
@@ -294,6 +297,77 @@
     } 
 }
 
+template<typename Scalar, typename Index_>
+template<bool Conjugate, typename Dest>
+void MappedSuperNodalMatrix<Scalar,Index_>::solveTransposedInPlace( MatrixBase<Dest>&X) const
+{
+    using numext::conj;
+  Index n    = int(X.rows());
+  Index nrhs = Index(X.cols());
+  const Scalar * Lval = valuePtr();                 // Nonzero values
+  Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);     // working vector
+  work.setZero();
+  for (Index k = nsuper(); k >= 0; k--)
+  {
+    Index fsupc = supToCol()[k];                    // First column of the current supernode
+    Index istart = rowIndexPtr()[fsupc];            // Pointer index to the subscript of the current column
+    Index nsupr = rowIndexPtr()[fsupc+1] - istart;  // Number of rows in the current supernode
+    Index nsupc = supToCol()[k+1] - fsupc;          // Number of columns in the current supernode
+    Index nrow = nsupr - nsupc;                     // Number of rows in the non-diagonal part of the supernode
+    Index irow;                                     //Current index row
+
+    if (nsupc == 1 )
+    {
+      for (Index j = 0; j < nrhs; j++)
+      {
+        InnerIterator it(*this, fsupc);
+        ++it; // Skip the diagonal element
+        for (; it; ++it)
+        {
+          irow = it.row();
+          X(fsupc,j) -= X(irow, j) * (Conjugate?conj(it.value()):it.value());
+        }
+      }
+    }
+    else
+    {
+      // The supernode has more than one column
+      Index luptr = colIndexPtr()[fsupc];
+      Index lda = colIndexPtr()[fsupc+1] - luptr;
+
+      //Begin Gather
+      for (Index j = 0; j < nrhs; j++)
+      {
+        Index iptr = istart + nsupc;
+        for (Index i = 0; i < nrow; i++)
+        {
+          irow = rowIndex()[iptr];
+          work.topRows(nrow)(i,j)= X(irow,j); // Gather operation
+          iptr++;
+        }
+      }
+
+      // Matrix-vector product with transposed submatrix
+      Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
+      Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+      if(Conjugate)
+        U = U - A.adjoint() * work.topRows(nrow);
+      else
+        U = U - A.transpose() * work.topRows(nrow);
+
+      // Triangular solve (of transposed diagonal block)
+      new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+      if(Conjugate)
+        U = A.adjoint().template triangularView<UnitUpper>().solve(U);
+      else
+        U = A.transpose().template triangularView<UnitUpper>().solve(U);
+
+    }
+
+  }
+}
+
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt
index 9887d58..545bc98 100644
--- a/blas/CMakeLists.txt
+++ b/blas/CMakeLists.txt
@@ -1,15 +1,13 @@
 
 project(EigenBlas CXX)
 
-include("../cmake/language_support.cmake")
-
-workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
-if(EIGEN_Fortran_COMPILER_WORKS)
-  enable_language(Fortran OPTIONAL)
-  if(NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_Fortran_COMPILER_WORKS OFF)
-  endif()
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
 endif()
 
 add_custom_target(blas)
diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake
deleted file mode 100644
index 591fca5..0000000
--- a/cmake/language_support.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# cmake/modules/language_support.cmake
-#
-# Temporary additional general language support is contained within this
-# file.  
-
-# This additional function definition is needed to provide a workaround for
-# CMake bug 9220.
-
-# On debian testing (cmake 2.6.2), I get return code zero when calling 
-# cmake the first time, but cmake crashes when running a second time
-# as follows:
-#
-#  -- The Fortran compiler identification is unknown
-#  CMake Error at /usr/share/cmake-2.6/Modules/CMakeFortranInformation.cmake:7 (GET_FILENAME_COMPONENT):
-#    get_filename_component called with incorrect number of arguments
-#  Call Stack (most recent call first):
-#    CMakeLists.txt:3 (enable_language)
-#
-# My workaround is to invoke cmake twice.  If both return codes are zero, 
-# it is safe to invoke enable_language(Fortran OPTIONAL)
-
-function(workaround_9220 language language_works)
-  #message("DEBUG: language = ${language}")
-  set(text
-    "project(test NONE)
-    cmake_minimum_required(VERSION 2.8.11)
-    set (CMAKE_Fortran_FLAGS \"${CMAKE_Fortran_FLAGS}\")
-    set (CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS}\")
-    enable_language(${language})
-  ")
-  file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/language_tests/${language})
-  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language})
-  file(WRITE ${CMAKE_BINARY_DIR}/language_tests/${language}/CMakeLists.txt
-    ${text})
-  execute_process(
-    COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}" -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
-    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
-    RESULT_VARIABLE return_code
-    OUTPUT_QUIET
-    ERROR_QUIET
-    )
-
-  if(return_code EQUAL 0)
-    # Second run
-    execute_process (
-      COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}" -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
-      RESULT_VARIABLE return_code
-      OUTPUT_QUIET
-      ERROR_QUIET
-      )
-    if(return_code EQUAL 0)
-      set(${language_works} ON PARENT_SCOPE)
-    else()
-      set(${language_works} OFF PARENT_SCOPE)
-    endif()
-  else()
-    set(${language_works} OFF PARENT_SCOPE)
-  endif()
-endfunction()
-
-# Temporary tests of the above function.
-#workaround_9220(CXX CXX_language_works)
-#message("CXX_language_works = ${CXX_language_works}")
-#workaround_9220(CXXp CXXp_language_works)
-#message("CXXp_language_works = ${CXXp_language_works}")
-
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index a85115b..9eec810 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -1,15 +1,13 @@
 
 project(EigenLapack CXX)
 
-include("../cmake/language_support.cmake")
-
-workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
-if(EIGEN_Fortran_COMPILER_WORKS)
-  enable_language(Fortran OPTIONAL)
-  if(NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_Fortran_COMPILER_WORKS OFF)
-  endif()
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
 endif()
 
 add_custom_target(lapack)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ce37821..c624950 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,18 +5,13 @@
 endif()
 
 # check if we have a Fortran compiler
-include("../cmake/language_support.cmake")
-
-workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
-if(EIGEN_Fortran_COMPILER_WORKS)
-  enable_language(Fortran OPTIONAL)
-  if(NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_Fortran_COMPILER_WORKS OFF)
-  endif()
-endif()
-
-if(NOT EIGEN_Fortran_COMPILER_WORKS)
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
   # search for a default Lapack library to complete Eigen's one
   find_package(LAPACK QUIET)
 endif()
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 6910f0e..a1529bc 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -9,6 +9,77 @@
 
 #include "main.h"
 
+
+// Test the corner cases of pow(x, y) for real types.
+template<typename Scalar>
+void pow_test() {
+  const Scalar zero = Scalar(0);
+  const Scalar one = Scalar(1);
+  const Scalar two = Scalar(2);
+  const Scalar three = Scalar(3);
+  const Scalar sqrt_half = Scalar(std::sqrt(0.5));
+  const Scalar sqrt2 = Scalar(std::sqrt(2));
+  const Scalar inf = std::numeric_limits<Scalar>::infinity();
+  const Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  const Scalar min = (std::numeric_limits<Scalar>::min)();
+  const Scalar max = (std::numeric_limits<Scalar>::max)();
+  const static Scalar abs_vals[] = {zero,
+                                    sqrt_half,
+                                    one,
+                                    sqrt2,
+                                    two,
+                                    three,
+                                    min,
+                                    max,
+                                    inf,
+                                    nan};
+
+  const int abs_cases = 10;
+  const int num_cases = 2*abs_cases * 2*abs_cases;
+  // Repeat the same value to make sure we hit the vectorized path.
+  const int num_repeats = 32;
+  Array<Scalar, Dynamic, Dynamic> x(num_repeats, num_cases);
+  Array<Scalar, Dynamic, Dynamic> y(num_repeats, num_cases);
+  int count = 0;
+  for (int i = 0; i < abs_cases; ++i) {
+    const Scalar abs_x = abs_vals[i];
+    for (int sign_x = 0; sign_x < 2; ++sign_x) {
+      Scalar x_case = sign_x == 0 ? -abs_x : abs_x;
+      for (int j = 0; j < abs_cases; ++j) {
+        const Scalar abs_y = abs_vals[j];
+        for (int sign_y = 0; sign_y < 2; ++sign_y) {
+          Scalar y_case = sign_y == 0 ? -abs_y : abs_y;
+          for (int repeat = 0; repeat < num_repeats; ++repeat) {
+            x(repeat, count) = x_case;
+            y(repeat, count) = y_case;
+          }
+          ++count;
+        }
+      }
+    }
+  }
+
+  Array<Scalar, Dynamic, Dynamic> actual = x.pow(y);
+  const Scalar tol = test_precision<Scalar>();
+  bool all_pass = true;
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < num_cases; ++j) {
+      // TODO(rmlarsen): Skip tests that trigger a known bug in pldexp for now.
+      if (std::abs(x(i,j)) == max || std::abs(x(i,j)) == min) continue;
+
+      Scalar e = numext::pow(x(i,j), y(i,j));
+      Scalar a = actual(i, j);
+      bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e));
+      all_pass &= !fail;
+      if (fail) {
+        std::cout << "pow(" << x(i,j) << "," << y(i,j) << ")   =   " << a << " !=  " << e << std::endl;
+      }
+    }
+  }
+  VERIFY(all_pass);
+}
+
+
 template<typename ArrayType> void array(const ArrayType& m)
 {
   typedef typename ArrayType::Scalar Scalar;
@@ -273,7 +344,7 @@
             m3(rows, cols),
             m4 = m1;
 
-  m4 = (m4.abs()==Scalar(0)).select(1,m4);
+  m4 = (m4.abs()==Scalar(0)).select(Scalar(1),m4);
 
   Scalar  s1 = internal::random<Scalar>();
 
@@ -302,7 +373,7 @@
   VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
   VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
   VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
-  VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
+  VERIFY_IS_APPROX(m4.inverse(), inverse(m4));
   VERIFY_IS_APPROX(m1.abs(), abs(m1));
   VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
   VERIFY_IS_APPROX(m1.square(), square(m1));
@@ -311,11 +382,11 @@
   VERIFY_IS_APPROX(m1.sign(), sign(m1));
   VERIFY((m1.sqrt().sign().isNaN() == (Eigen::isnan)(sign(sqrt(m1)))).all());
 
-  // avoid NaNs with abs() so verification doesn't fail
-  m3 = m1.abs();
-  VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m1)));
-  VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m1)));
-  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m1)));
+  // avoid inf and NaNs so verification doesn't fail
+  m3 = m4.abs();
+  VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
+  VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
+  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
   VERIFY_IS_APPROX(m3.log(), log(m3));
   VERIFY_IS_APPROX(m3.log1p(), log1p(m3));
   VERIFY_IS_APPROX(m3.log10(), log10(m3));
@@ -327,23 +398,23 @@
   VERIFY_IS_APPROX(sin(m1.asin()), m1);
   VERIFY_IS_APPROX(cos(m1.acos()), m1);
   VERIFY_IS_APPROX(tan(m1.atan()), m1);
-  VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
-  VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
-  VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
-  VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0+exp(-m1))));
-  VERIFY_IS_APPROX(arg(m1), ((m1<0).template cast<Scalar>())*std::acos(-1.0));
+  VERIFY_IS_APPROX(sinh(m1), Scalar(0.5)*(exp(m1)-exp(-m1)));
+  VERIFY_IS_APPROX(cosh(m1), Scalar(0.5)*(exp(m1)+exp(-m1)));
+  VERIFY_IS_APPROX(tanh(m1), (Scalar(0.5)*(exp(m1)-exp(-m1)))/(Scalar(0.5)*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (Scalar(1)/(Scalar(1)+exp(-m1))));
+  VERIFY_IS_APPROX(arg(m1), ((m1<Scalar(0)).template cast<Scalar>())*Scalar(std::acos(Scalar(-1))));
   VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
   VERIFY((rint(m1) <= ceil(m1) && rint(m1) >= floor(m1)).all());
   VERIFY(((ceil(m1) - round(m1)) <= Scalar(0.5) || (round(m1) - floor(m1)) <= Scalar(0.5)).all());
   VERIFY(((ceil(m1) - round(m1)) <= Scalar(1.0) && (round(m1) - floor(m1)) <= Scalar(1.0)).all());
   VERIFY(((ceil(m1) - rint(m1)) <= Scalar(0.5) || (rint(m1) - floor(m1)) <= Scalar(0.5)).all());
   VERIFY(((ceil(m1) - rint(m1)) <= Scalar(1.0) && (rint(m1) - floor(m1)) <= Scalar(1.0)).all());
-  VERIFY((Eigen::isnan)((m1*0.0)/0.0).all());
-  VERIFY((Eigen::isinf)(m4/0.0).all());
-  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*0.0/0.0)) && (!(Eigen::isfinite)(m4/0.0))).all());
-  VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
+  VERIFY((Eigen::isnan)((m1*Scalar(0))/Scalar(0)).all());
+  VERIFY((Eigen::isinf)(m4/Scalar(0)).all());
+  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*Scalar(0)/Scalar(0))) && (!(Eigen::isfinite)(m4/Scalar(0)))).all());
+  VERIFY_IS_APPROX(inverse(inverse(m4)),m4);
   VERIFY((abs(m1) == m1 || abs(m1) == -m1).all());
-  VERIFY_IS_APPROX(m3, sqrt(abs2(m1)));
+  VERIFY_IS_APPROX(m3, sqrt(abs2(m3)));
   VERIFY_IS_APPROX(m1.absolute_difference(m2), (m1 > m2).select(m1 - m2, m2 - m1));
   VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
   VERIFY_IS_APPROX( m1*m1.sign(),m1.abs());
@@ -356,15 +427,15 @@
 
   // shift argument of logarithm so that it is not zero
   Scalar smallNumber = NumTraits<Scalar>::dummy_precision();
-  VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m1) + smallNumber));
-  VERIFY_IS_APPROX((m3 + smallNumber + 1).log() , log1p(abs(m1) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m3) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber + Scalar(1)).log() , log1p(abs(m3) + smallNumber));
 
   VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
   VERIFY_IS_APPROX(m1.exp(), exp(m1));
   VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
 
   VERIFY_IS_APPROX(m1.expm1(), expm1(m1));
-  VERIFY_IS_APPROX((m3 + smallNumber).exp() - 1, expm1(abs(m3) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber).exp() - Scalar(1), expm1(abs(m3) + smallNumber));
 
   VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
@@ -372,8 +443,13 @@
   VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
 
-  VERIFY_IS_APPROX(log10(m3), log(m3)/log(10));
-  VERIFY_IS_APPROX(log2(m3), log(m3)/log(2));
+  // Avoid inf and NaN.
+  m3 = (m1.square()<NumTraits<Scalar>::epsilon()).select(Scalar(1),m3);
+  VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse());
+  pow_test<Scalar>();
+
+  VERIFY_IS_APPROX(log10(m3), log(m3)/numext::log(Scalar(10)));
+  VERIFY_IS_APPROX(log2(m3), log(m3)/numext::log(Scalar(2)));
 
   // scalar by array division
   const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
@@ -422,7 +498,7 @@
   VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
   VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
   VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
-  VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
+  VERIFY_IS_APPROX(m4.inverse(), inverse(m4));
   VERIFY_IS_APPROX(m1.log(), log(m1));
   VERIFY_IS_APPROX(m1.log10(), log10(m1));
   VERIFY_IS_APPROX(m1.log2(), log2(m1));
@@ -476,7 +552,7 @@
 
   VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*zero/zero)) && (!(Eigen::isfinite)(m1/zero))).all());
 
-  VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
+  VERIFY_IS_APPROX(inverse(inverse(m4)),m4);
   VERIFY_IS_APPROX(conj(m1.conjugate()), m1);
   VERIFY_IS_APPROX(abs(m1), sqrt(square(m1.real())+square(m1.imag())));
   VERIFY_IS_APPROX(abs(m1), sqrt(abs2(m1)));
@@ -564,6 +640,8 @@
     CALL_SUBTEST_2( array_real(Array22f()) );
     CALL_SUBTEST_3( array_real(Array44d()) );
     CALL_SUBTEST_5( array_real(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( array_real(Array<Eigen::half, 32, 32>()) );
+    CALL_SUBTEST_8( array_real(Array<Eigen::bfloat16, 32, 32>()) );
   }
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_4( array_complex(ArrayXXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index b82b94d..46e4a43 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -107,6 +107,116 @@
 };
 
 template<typename T>
+struct complex_operators {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_scalar_operators = 24;
+    const int num_vector_operators = 23;  // no unary + operator.
+    int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);
+    
+    // Scalar operators.
+    const ComplexType a = in[i];
+    const ComplexType b = in[i + 1];
+    
+    out[out_idx++] = +a;
+    out[out_idx++] = -a;
+    
+    out[out_idx++] = a + b;
+    out[out_idx++] = a + numext::real(b);
+    out[out_idx++] = numext::real(a) + b;
+    out[out_idx++] = a - b;
+    out[out_idx++] = a - numext::real(b);
+    out[out_idx++] = numext::real(a) - b;
+    out[out_idx++] = a * b;
+    out[out_idx++] = a * numext::real(b);
+    out[out_idx++] = numext::real(a) * b;
+    out[out_idx++] = a / b;
+    out[out_idx++] = a / numext::real(b);
+    out[out_idx++] = numext::real(a) / b;
+    
+    out[out_idx] = a; out[out_idx++] += b;
+    out[out_idx] = a; out[out_idx++] -= b;
+    out[out_idx] = a; out[out_idx++] *= b;
+    out[out_idx] = a; out[out_idx++] /= b;
+    
+    const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
+    const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
+    out[out_idx++] = (a == b ? true_value : false_value);
+    out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
+    out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
+    out[out_idx++] = (a != b ? true_value : false_value);
+    out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
+    out[out_idx++] = (numext::real(a) != b ? true_value : false_value);
+    
+    // Vector versions.
+    T x1(in + i);
+    T x2(in + i + 1);
+    const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
+    const int size = T::MaxSizeAtCompileTime;
+    int block_idx = 0;
+    
+    Map<VectorX<ComplexType>> res(out + out_idx, res_size);
+    res.segment(block_idx, size) = -x1;
+    block_idx += size;
+    
+    res.segment(block_idx, size) = x1 + x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 + x2.real();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real() + x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 - x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 - x2.real();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real() - x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() * x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() * x2.real().array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real().array() * x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() / x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() / x2.real().array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real().array() / x2.array();
+    block_idx += size;
+    
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
+    block_idx += size;
+
+    // Equality comparisons currently not functional on device
+    //   (std::equal_to<T> is host-only).
+    // const T true_vector = T::Constant(true_value);
+    // const T false_vector = T::Constant(false_value);
+    // res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
+    // block_idx += size;
+  }
+};
+
+template<typename T>
 struct replicate {
   EIGEN_DEVICE_FUNC
   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
@@ -297,6 +407,8 @@
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
 
+  // Test std::complex.
+  CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
   CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
 
 #if defined(__NVCC__)
diff --git a/test/gpu_common.h b/test/gpu_common.h
index fe0485e..c37eaa1 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -117,6 +117,7 @@
   void operator()(int i, const int* /*in*/, int* info) const
   {
     if (i == 0) {
+      EIGEN_UNUSED_VARIABLE(info)
       #if defined(__CUDA_ARCH__)
       info[0] = int(__CUDA_ARCH__ +0);
       #endif
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index 0919660..fde73f2 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -162,6 +162,32 @@
     VERIFY_IS_APPROX(map,s1*m);
   }
 
+  // test negative strides
+  {
+    Matrix<Scalar,Dynamic,1>::Map(a_array1, arraysize+1).setRandom();
+    Index outerstride = m.innerSize()+4;
+    Scalar* array = array1;
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, OuterStride<> > map2(array+(m.outerSize()-1)*outerstride, rows, cols, OuterStride<>(-outerstride));
+      if(MatrixType::IsRowMajor)  VERIFY_IS_APPROX(map1.colwise().reverse(), map2);
+      else                        VERIFY_IS_APPROX(map1.rowwise().reverse(), map2);
+    }
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, Stride<Dynamic,Dynamic> > map2(array+(m.outerSize()-1)*outerstride+m.innerSize()-1, rows, cols, Stride<Dynamic,Dynamic>(-outerstride,-1));
+      VERIFY_IS_APPROX(map1.reverse(), map2);
+    }
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, Stride<Dynamic,-1> > map2(array+(m.outerSize()-1)*outerstride+m.innerSize()-1, rows, cols, Stride<Dynamic,-1>(-outerstride,-1));
+      VERIFY_IS_APPROX(map1.reverse(), map2);
+    }
+  }
+
   internal::aligned_delete(a_array1, arraysize+1);
 }
 
@@ -200,7 +226,7 @@
 EIGEN_DECLARE_TEST(mapstride)
 {
   for(int i = 0; i < g_repeat; i++) {
-    int maxn = 30;
+    int maxn = 3;
     CALL_SUBTEST_1( map_class_vector<Aligned>(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_1( map_class_vector<Unaligned>(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( map_class_vector<Aligned>(Vector4d()) );
diff --git a/test/numext.cpp b/test/numext.cpp
index ff4d13f..cf1ca17 100644
--- a/test/numext.cpp
+++ b/test/numext.cpp
@@ -9,6 +9,33 @@
 
 #include "main.h"
 
+template<typename T, typename U>
+bool check_if_equal_or_nans(const T& actual, const U& expected) {
+  return ((actual == expected) || ((numext::isnan)(actual) && (numext::isnan)(expected)));
+}
+
+template<typename T, typename U>
+bool check_if_equal_or_nans(const std::complex<T>& actual, const std::complex<U>& expected) {
+  return check_if_equal_or_nans(numext::real(actual), numext::real(expected))
+         && check_if_equal_or_nans(numext::imag(actual), numext::imag(expected));
+}
+
+template<typename T, typename U>
+bool test_is_equal_or_nans(const T& actual, const U& expected)
+{
+    if (check_if_equal_or_nans(actual, expected)) {
+      return true;
+    }
+
+    // false:
+    std::cerr
+        << "\n    actual   = " << actual
+        << "\n    expected = " << expected << "\n\n";
+    return false;
+}
+
+#define VERIFY_IS_EQUAL_OR_NANS(a, b) VERIFY(test_is_equal_or_nans(a, b))
+
 template<typename T>
 void check_abs() {
   typedef typename NumTraits<T>::Real Real;
@@ -19,7 +46,7 @@
   VERIFY_IS_EQUAL(numext::abs(T(0)), T(0));
   VERIFY_IS_EQUAL(numext::abs(T(1)), T(1));
 
-  for(int k=0; k<g_repeat*100; ++k)
+  for(int k=0; k<100; ++k)
   {
     T x = internal::random<T>();
     if(!internal::is_same<T,bool>::value)
@@ -34,22 +61,199 @@
   }
 }
 
-EIGEN_DECLARE_TEST(numext) {
-  CALL_SUBTEST( check_abs<bool>() );
-  CALL_SUBTEST( check_abs<signed char>() );
-  CALL_SUBTEST( check_abs<unsigned char>() );
-  CALL_SUBTEST( check_abs<short>() );
-  CALL_SUBTEST( check_abs<unsigned short>() );
-  CALL_SUBTEST( check_abs<int>() );
-  CALL_SUBTEST( check_abs<unsigned int>() );
-  CALL_SUBTEST( check_abs<long>() );
-  CALL_SUBTEST( check_abs<unsigned long>() );
-  CALL_SUBTEST( check_abs<half>() );
-  CALL_SUBTEST( check_abs<bfloat16>() );
-  CALL_SUBTEST( check_abs<float>() );
-  CALL_SUBTEST( check_abs<double>() );
-  CALL_SUBTEST( check_abs<long double>() );
+template<typename T>
+struct check_sqrt_impl {
+  static void run() {
+    for (int i=0; i<1000; ++i) {
+      const T x = numext::abs(internal::random<T>());
+      const T sqrtx = numext::sqrt(x);
+      VERIFY_IS_APPROX(sqrtx*sqrtx, x);
+    }
 
-  CALL_SUBTEST( check_abs<std::complex<float> >() );
-  CALL_SUBTEST( check_abs<std::complex<double> >() );
+    // Corner cases.
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+    VERIFY_IS_EQUAL(numext::sqrt(zero), zero);
+    VERIFY_IS_EQUAL(numext::sqrt(inf), inf);
+    VERIFY((numext::isnan)(numext::sqrt(nan)));
+    VERIFY((numext::isnan)(numext::sqrt(-one)));
+  }
+};
+
+template<typename T>
+struct check_sqrt_impl<std::complex<T>  > {
+  static void run() {
+    typedef typename std::complex<T> ComplexT;
+
+    for (int i=0; i<1000; ++i) {
+      const ComplexT x = internal::random<ComplexT>();
+      const ComplexT sqrtx = numext::sqrt(x);
+      VERIFY_IS_APPROX(sqrtx*sqrtx, x);
+    }
+
+    // Corner cases.
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    // Set of corner cases from https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+    const int kNumCorners = 20;
+    const ComplexT corners[kNumCorners][2] = {
+      {ComplexT(zero, zero), ComplexT(zero, zero)},
+      {ComplexT(-zero, zero), ComplexT(zero, zero)},
+      {ComplexT(zero, -zero), ComplexT(zero, zero)},
+      {ComplexT(-zero, -zero), ComplexT(zero, zero)},
+      {ComplexT(one, inf), ComplexT(inf, inf)},
+      {ComplexT(nan, inf), ComplexT(inf, inf)},
+      {ComplexT(one, -inf), ComplexT(inf, -inf)},
+      {ComplexT(nan, -inf), ComplexT(inf, -inf)},
+      {ComplexT(-inf, one), ComplexT(zero, inf)},
+      {ComplexT(inf, one), ComplexT(inf, zero)},
+      {ComplexT(-inf, -one), ComplexT(zero, -inf)},
+      {ComplexT(inf, -one), ComplexT(inf, -zero)},
+      {ComplexT(-inf, nan), ComplexT(nan, inf)},
+      {ComplexT(inf, nan), ComplexT(inf, nan)},
+      {ComplexT(zero, nan), ComplexT(nan, nan)},
+      {ComplexT(one, nan), ComplexT(nan, nan)},
+      {ComplexT(nan, zero), ComplexT(nan, nan)},
+      {ComplexT(nan, one), ComplexT(nan, nan)},
+      {ComplexT(nan, -one), ComplexT(nan, nan)},
+      {ComplexT(nan, nan), ComplexT(nan, nan)},
+    };
+
+    for (int i=0; i<kNumCorners; ++i) {
+      const ComplexT& x = corners[i][0];
+      const ComplexT sqrtx = corners[i][1];
+      VERIFY_IS_EQUAL_OR_NANS(numext::sqrt(x), sqrtx);
+    }
+  }
+};
+
+template<typename T>
+void check_sqrt() {
+  check_sqrt_impl<T>::run();
+}
+
+template<typename T>
+struct check_rsqrt_impl {
+  static void run() {
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    for (int i=0; i<1000; ++i) {
+      const T x = numext::abs(internal::random<T>());
+      const T rsqrtx = numext::rsqrt(x);
+      const T invx = one / x;
+      VERIFY_IS_APPROX(rsqrtx*rsqrtx, invx);
+    }
+
+    // Corner cases.
+    VERIFY_IS_EQUAL(numext::rsqrt(zero), inf);
+    VERIFY_IS_EQUAL(numext::rsqrt(inf), zero);
+    VERIFY((numext::isnan)(numext::rsqrt(nan)));
+    VERIFY((numext::isnan)(numext::rsqrt(-one)));
+  }
+};
+
+template<typename T>
+struct check_rsqrt_impl<std::complex<T> > {
+  static void run() {
+    typedef typename std::complex<T> ComplexT;
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    for (int i=0; i<1000; ++i) {
+      const ComplexT x = internal::random<ComplexT>();
+      const ComplexT invx = ComplexT(one, zero) / x;
+      const ComplexT rsqrtx = numext::rsqrt(x);
+      VERIFY_IS_APPROX(rsqrtx*rsqrtx, invx);
+    }
+
+    // GCC and MSVC differ in their treatment of 1/(0 + 0i)
+    //   GCC/clang = (inf, nan)
+    //   MSVC = (nan, nan)
+    // and 1 / (x + inf i)
+    //   GCC/clang = (0, 0)
+    //   MSVC = (nan, nan)
+    #if (EIGEN_COMP_GNUC)
+    {
+      const int kNumCorners = 20;
+      const ComplexT corners[kNumCorners][2] = {
+        // Only consistent across GCC, clang
+        {ComplexT(zero, zero), ComplexT(zero, zero)},
+        {ComplexT(-zero, zero), ComplexT(zero, zero)},
+        {ComplexT(zero, -zero), ComplexT(zero, zero)},
+        {ComplexT(-zero, -zero), ComplexT(zero, zero)},
+        {ComplexT(one, inf), ComplexT(inf, inf)},
+        {ComplexT(nan, inf), ComplexT(inf, inf)},
+        {ComplexT(one, -inf), ComplexT(inf, -inf)},
+        {ComplexT(nan, -inf), ComplexT(inf, -inf)},
+        // Consistent across GCC, clang, MSVC
+        {ComplexT(-inf, one), ComplexT(zero, inf)},
+        {ComplexT(inf, one), ComplexT(inf, zero)},
+        {ComplexT(-inf, -one), ComplexT(zero, -inf)},
+        {ComplexT(inf, -one), ComplexT(inf, -zero)},
+        {ComplexT(-inf, nan), ComplexT(nan, inf)},
+        {ComplexT(inf, nan), ComplexT(inf, nan)},
+        {ComplexT(zero, nan), ComplexT(nan, nan)},
+        {ComplexT(one, nan), ComplexT(nan, nan)},
+        {ComplexT(nan, zero), ComplexT(nan, nan)},
+        {ComplexT(nan, one), ComplexT(nan, nan)},
+        {ComplexT(nan, -one), ComplexT(nan, nan)},
+        {ComplexT(nan, nan), ComplexT(nan, nan)},
+      };
+
+      for (int i=0; i<kNumCorners; ++i) {
+        const ComplexT& x = corners[i][0];
+        const ComplexT rsqrtx = ComplexT(one, zero) / corners[i][1];
+        VERIFY_IS_EQUAL_OR_NANS(numext::rsqrt(x), rsqrtx);
+      }
+    }
+    #endif
+  }
+};
+
+template<typename T>
+void check_rsqrt() {
+  check_rsqrt_impl<T>::run();
+}
+
+EIGEN_DECLARE_TEST(numext) {
+  for(int k=0; k<g_repeat; ++k)
+  {
+    CALL_SUBTEST( check_abs<bool>() );
+    CALL_SUBTEST( check_abs<signed char>() );
+    CALL_SUBTEST( check_abs<unsigned char>() );
+    CALL_SUBTEST( check_abs<short>() );
+    CALL_SUBTEST( check_abs<unsigned short>() );
+    CALL_SUBTEST( check_abs<int>() );
+    CALL_SUBTEST( check_abs<unsigned int>() );
+    CALL_SUBTEST( check_abs<long>() );
+    CALL_SUBTEST( check_abs<unsigned long>() );
+    CALL_SUBTEST( check_abs<half>() );
+    CALL_SUBTEST( check_abs<bfloat16>() );
+    CALL_SUBTEST( check_abs<float>() );
+    CALL_SUBTEST( check_abs<double>() );
+    CALL_SUBTEST( check_abs<long double>() );
+
+    CALL_SUBTEST( check_abs<std::complex<float> >() );
+    CALL_SUBTEST( check_abs<std::complex<double> >() );
+
+    CALL_SUBTEST( check_sqrt<float>() );
+    CALL_SUBTEST( check_sqrt<double>() );
+    CALL_SUBTEST( check_sqrt<std::complex<float> >() );
+    CALL_SUBTEST( check_sqrt<std::complex<double> >() );
+    
+    CALL_SUBTEST( check_rsqrt<float>() );
+    CALL_SUBTEST( check_rsqrt<double>() );
+    CALL_SUBTEST( check_rsqrt<std::complex<float> >() );
+    CALL_SUBTEST( check_rsqrt<std::complex<double> >() );
+  }
 }
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index ab9bec1..c388f3a 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -46,6 +46,21 @@
   return a && b;
 }
 
+template <typename T>
+inline T REF_FREXP(const T& x, T& exp) {
+  int iexp;
+  EIGEN_USING_STD(frexp)
+  const T out = static_cast<T>(frexp(x, &iexp));
+  exp = static_cast<T>(iexp);
+  return out;
+}
+
+template <typename T>
+inline T REF_LDEXP(const T& x, const T& exp) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<T>(ldexp(x, static_cast<int>(exp)));
+}
+
 // Uses pcast to cast from one array to another.
 template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
 struct pcast_array;
@@ -552,6 +567,48 @@
     data2[i] = Scalar(internal::random<double>(-87, 88));
   }
   CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp);
+  CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1));
+    data2[i] = Scalar(internal::random<double>(-1, 1));
+  }
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i+PacketSize] = Scalar(internal::random<int>(-4, 4));
+    data2[i+PacketSize] = Scalar(internal::random<double>(-4, 4));
+  }
+  CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+  if (PacketTraits::HasExp) {
+    data1[0] = Scalar(-1);
+    // underflow to zero
+    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // overflow to inf
+    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // NaN stays NaN
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    VERIFY((numext::isnan)(data2[0]));
+    // inf stays inf
+    data1[0] = NumTraits<Scalar>::infinity();
+    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // zero stays zero
+    data1[0] = Scalar(0);
+    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // Small number big exponent.
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), std::numeric_limits<Scalar>::min_exponent-1));
+    data1[PacketSize] = Scalar(-std::numeric_limits<Scalar>::min_exponent
+                               +std::numeric_limits<Scalar>::max_exponent);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // Big number small exponent.
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), std::numeric_limits<Scalar>::max_exponent-1));
+    data1[PacketSize] = Scalar(+std::numeric_limits<Scalar>::min_exponent
+                               -std::numeric_limits<Scalar>::max_exponent);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+  }
+
   for (int i = 0; i < size; ++i) {
     data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
     data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
index 46a4260..027715a 100644
--- a/test/packetmath_test_shared.h
+++ b/test/packetmath_test_shared.h
@@ -143,6 +143,9 @@
 
   template<typename T>
   inline void store(T* to, const Packet& x, unsigned long long umask) const { internal::pstoreu(to, x, umask); }
+
+  template<typename T>
+  inline Packet& forward_reference(Packet& packet, T& /*scalar*/) const { return packet; }
 };
 
 template<typename Packet>
@@ -162,6 +165,9 @@
 
   template<typename T>
   inline void store(T* to, const T& x, unsigned long long) const { *to = x; }
+
+  template<typename T>
+  inline T& forward_reference(Packet& /*packet*/, T& scalar) const { return scalar; }
 };
 
 #define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \
@@ -180,6 +186,18 @@
   VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
 }
 
+// One input, one output by reference.
+#define CHECK_CWISE1_BYREF1_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i], ref[i+PacketSize]));     \
+  Packet pout; \
+  Scalar sout; \
+  h.store(data2, POP(h.load(data1), h.forward_reference(pout, sout))); \
+  h.store(data2+PacketSize, h.forward_reference(pout, sout)); \
+  VERIFY(test::areApprox(ref, data2, 2 * PacketSize) && #POP); \
+}
+
 #define CHECK_CWISE3_IF(COND, REFOP, POP) if (COND) {                      \
   test::packet_helper<COND, Packet> h;                                     \
   for (int i = 0; i < PacketSize; ++i)                                     \
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index bd31df8..15c6989 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -93,6 +93,22 @@
 
   VERIFY_IS_APPROX(m1.col(j2).adjoint() * m1.block(0,j,m1.rows(),c), m1.col(j2).adjoint().eval() * m1.block(0,j,m1.rows(),c).eval());
   VERIFY_IS_APPROX(m1.block(i,0,r,m1.cols()) * m1.row(i2).adjoint(), m1.block(i,0,r,m1.cols()).eval() * m1.row(i2).adjoint().eval());
+
+  // test negative strides
+  {
+    Map<MatrixType,Unaligned,Stride<Dynamic,Dynamic> > map1(&m1(rows-1,cols-1), rows, cols, Stride<Dynamic,Dynamic>(-m1.outerStride(),-1));
+    Map<MatrixType,Unaligned,Stride<Dynamic,Dynamic> > map2(&m2(rows-1,cols-1), rows, cols, Stride<Dynamic,Dynamic>(-m2.outerStride(),-1));
+    Map<RowVectorType,Unaligned,InnerStride<-1> > mapv1(&v1(v1.size()-1), v1.size(), InnerStride<-1>(-1));
+    Map<ColVectorType,Unaligned,InnerStride<-1> > mapvc2(&vc2(vc2.size()-1), vc2.size(), InnerStride<-1>(-1));
+    VERIFY_IS_APPROX(MatrixType(map1), m1.reverse());
+    VERIFY_IS_APPROX(MatrixType(map2), m2.reverse());
+    VERIFY_IS_APPROX(m3.noalias() = MatrixType(map1) * MatrixType(map2).adjoint(), m1.reverse() * m2.reverse().adjoint());
+    VERIFY_IS_APPROX(m3.noalias() = map1 * map2.adjoint(), m1.reverse() * m2.reverse().adjoint());
+    VERIFY_IS_APPROX(map1 * vc2, m1.reverse() * vc2);
+    VERIFY_IS_APPROX(m1 * mapvc2, m1 * mapvc2);
+    VERIFY_IS_APPROX(map1.adjoint() * v1.transpose(), m1.adjoint().reverse() * v1.transpose());
+    VERIFY_IS_APPROX(m1.adjoint() * mapv1.transpose(), m1.adjoint() * v1.reverse().transpose());
+  }
   
   // regression test
   MatrixType tmp = m1 * m1.adjoint() * s1;
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index f45d7ef..5892794 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -9,6 +9,7 @@
 
 #include "sparse.h"
 #include <Eigen/SparseCore>
+#include <Eigen/SparseLU>
 #include <sstream>
 
 template<typename Solver, typename Rhs, typename Guess,typename Result>
@@ -144,6 +145,136 @@
   }
 }
 
+// specialization of generic check_sparse_solving for SuperLU in order to also test adjoint and transpose solves
+template<typename Scalar, typename Rhs, typename DenseMat, typename DenseRhs>
+void check_sparse_solving(Eigen::SparseLU<Eigen::SparseMatrix<Scalar> >& solver, const typename Eigen::SparseMatrix<Scalar>& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
+{
+  typedef typename Eigen::SparseMatrix<Scalar> Mat;
+  typedef typename Mat::StorageIndex StorageIndex;
+  typedef typename Eigen::SparseLU<Eigen::SparseMatrix<Scalar> > Solver;
+
+  // reference solutions computed by dense QR solver
+  DenseRhs refX1 = dA.householderQr().solve(db); // solution of A x = db
+  DenseRhs refX2 = dA.transpose().householderQr().solve(db); // solution of A^T * x = db (use transposed matrix A^T)
+  DenseRhs refX3 = dA.adjoint().householderQr().solve(db);  // solution of A^* * x = db (use adjoint matrix A^*)
+
+
+  {
+    Rhs x1(A.cols(), b.cols());
+    Rhs x2(A.cols(), b.cols());
+    Rhs x3(A.cols(), b.cols());
+    Rhs oldb = b;
+
+    solver.compute(A);
+    if (solver.info() != Success)
+    {
+      std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+      VERIFY(solver.info() == Success);
+    }
+    x1 = solver.solve(b);
+    if (solver.info() != Success)
+    {
+      std::cerr << "WARNING | sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+      return;
+    }
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+
+    // test solve with transposed
+    x2 = solver.transpose().solve(b);
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x2.isApprox(refX2,test_precision<Scalar>()));
+
+
+    // test solve with adjoint
+    //solver.template _solve_impl_transposed<true>(b, x3);
+    x3 = solver.adjoint().solve(b);
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x3.isApprox(refX3,test_precision<Scalar>()));
+
+    x1.setZero();
+    solve_with_guess(solver, b, x1, x1);
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+
+    x1.setZero();
+    x2.setZero();
+    x3.setZero();
+    // test the analyze/factorize API
+    solver.analyzePattern(A);
+    solver.factorize(A);
+    VERIFY(solver.info() == Success && "factorization failed when using analyzePattern/factorize API");
+    x1 = solver.solve(b);
+    x2 = solver.transpose().solve(b);
+    x3 = solver.adjoint().solve(b);
+
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+    VERIFY(x2.isApprox(refX2,test_precision<Scalar>()));
+    VERIFY(x3.isApprox(refX3,test_precision<Scalar>()));
+
+    x1.setZero();
+    // test with Map
+    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    solver.compute(Am);
+    VERIFY(solver.info() == Success && "factorization failed when using Map");
+    DenseRhs dx(refX1);
+    dx.setZero();
+    Map<DenseRhs> xm(dx.data(), dx.rows(), dx.cols());
+    Map<const DenseRhs> bm(db.data(), db.rows(), db.cols());
+    xm = solver.solve(bm);
+    VERIFY(solver.info() == Success && "solving failed when using Map");
+    VERIFY(oldb.isApprox(bm,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(xm.isApprox(refX1,test_precision<Scalar>()));
+  }
+
+  // if not too large, do some extra check:
+  if(A.rows()<2000)
+  {
+    // test initialization ctor
+    {
+      Rhs x(b.rows(), b.cols());
+      Solver solver2(A);
+      VERIFY(solver2.info() == Success);
+      x = solver2.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test dense Block as the result and rhs:
+    {
+      DenseRhs x(refX1.rows(), refX1.cols());
+      DenseRhs oldb(db);
+      x.setZero();
+      x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
+      VERIFY(oldb.isApprox(db,0.0) && "sparse solver testing: the rhs should not be modified!");
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test uncompressed inputs
+    {
+      Mat A2 = A;
+      A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<typename Mat::StorageIndex>().eval());
+      solver.compute(A2);
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test expression as input
+    {
+      solver.compute(0.5*(A+A));
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+
+      Solver solver2(0.5*(A+A));
+      Rhs x2 = solver2.solve(b);
+      VERIFY(x2.isApprox(refX1,test_precision<Scalar>()));
+    }
+  }
+}
+
+
 template<typename Solver, typename Rhs>
 void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const typename Solver::MatrixType& fullA, const Rhs& refX)
 {
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index ee5f916..cb8a80c 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -170,7 +170,13 @@
     VERIFY(!(numext::isfinite)(v.norm()));          VERIFY((numext::isnan)(v.norm()));
     VERIFY(!(numext::isfinite)(v.stableNorm()));    VERIFY((numext::isnan)(v.stableNorm()));
     VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY((numext::isnan)(v.blueNorm()));
-    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+    if (i2 != i || j2 != j) {
+      // hypot propagates inf over NaN.
+      VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isinf)(v.hypotNorm()));
+    } else {
+      // inf is overwritten by NaN, expect norm to be NaN.
+      VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+    }
   }
 
   // stableNormalize[d]
diff --git a/unsupported/Eigen/ArpackSupport b/unsupported/Eigen/ArpackSupport
index 28c95ff..67c4ac8 100644
--- a/unsupported/Eigen/ArpackSupport
+++ b/unsupported/Eigen/ArpackSupport
@@ -28,4 +28,3 @@
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_ARPACKSUPPORT_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index beed230..d73c600 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -42,18 +42,6 @@
 #include <thread>
 
 #ifdef _WIN32
-typedef __int16 int16_t;
-typedef unsigned __int16 uint16_t;
-typedef __int32 int32_t;
-typedef unsigned __int32 uint32_t;
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-#include <windows.h>
-#else
-#include <stdint.h>
-#endif
-
-#ifdef _WIN32
 #include <windows.h>
 #elif defined(__APPLE__)
 #include <mach/mach_time.h>
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 93b7791..575c060 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -33,7 +33,6 @@
 #if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
 #include <cstddef>
 #include <cstring>
-#include <stdint.h>
 #include <time.h>
 
 #include <vector>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 000ed5b..81bed57 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -258,12 +258,12 @@
           m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
           m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
           m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
-          const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff;
-          const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff;
-          const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff;
-          m_planePaddingTop = dz - dz / 2;
-          m_rowPaddingTop = dy - dy / 2;
-          m_colPaddingLeft = dx - dx / 2;
+          const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
+          const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
+          const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
+          m_planePaddingTop = dz / 2;
+          m_rowPaddingTop = dy / 2;
+          m_colPaddingLeft = dx / 2;
           break;
         }
         default:
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index d9ad21a..cb99bd2 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -421,4 +421,3 @@
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/Polynomials b/unsupported/Eigen/Polynomials
index 146e5c4..32ce2a2 100644
--- a/unsupported/Eigen/Polynomials
+++ b/unsupported/Eigen/Polynomials
@@ -135,4 +135,3 @@
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_POLYNOMIALS_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
index 7c1f716..1c2cd24 100644
--- a/unsupported/Eigen/src/FFT/ei_fftw_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -259,5 +259,3 @@
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 079e886..9bdc012 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -416,5 +416,3 @@
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index 8fe3ed8..07c5ef0 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -52,7 +52,7 @@
         Parameters()
             : factor(Scalar(100.))
             , maxfev(1000)
-            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , xtol(numext::sqrt(NumTraits<Scalar>::epsilon()))
             , nb_of_subdiagonals(-1)
             , nb_of_superdiagonals(-1)
             , epsfcn(Scalar(0.)) {}
@@ -70,7 +70,7 @@
 
     HybridNonLinearSolverSpace::Status hybrj1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
             );
 
     HybridNonLinearSolverSpace::Status solveInit(FVectorType  &x);
@@ -79,7 +79,7 @@
 
     HybridNonLinearSolverSpace::Status hybrd1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
             );
 
     HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType  &x);
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index 3aa363e..862212e 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -70,9 +70,9 @@
   const int dy = patch_y - 1;
   const int dx = patch_x - 1;
 
-  const int forward_pad_z = dz - dz / 2;
-  const int forward_pad_y = dy - dy / 2;
-  const int forward_pad_x = dx - dx / 2;
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
 
   for (int pz = 0; pz < patch_z; pz++) {
     for (int py = 0; py < patch_y; py++) {
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
index ca7994f..8d99a48 100644
--- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -166,9 +166,9 @@
   const int dy = patch_y - 1;
   const int dx = patch_x - 1;
 
-  const int forward_pad_z = dz - dz / 2;
-  const int forward_pad_y = dy - dy / 2;
-  const int forward_pad_x = dx - dx / 2;
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
 
   for (int pz = 0; pz < patch_z; pz++) {
     for (int py = 0; py < patch_y; py++) {
