Update Eigen to commit: f3912891504ad965d5fe8bd11d7346faa69b4026

CHANGELOG
=========
f39128915 - Fix bug in checking subnormals.
5a90fbcea - Update documentation of lapack second/dsecnd.
907982024 - Remove simple relicense script.
851b40afd - LAPACK CPU time functions.
a73970a86 - Fix arm32 issues.
580812201 - Formatting.
92f9544f6 - Remove explicit specialization of member function.
2692fb2b7 - Fix compile-time error caused by chip static asserts
2c6b61c00 - Add half and quarter vector support to HVX architecture
05a457534 - Chipping Asserts v2
fc92fe312 - SPQR: Fix build error, Index/StorageIndex mismatch.
eede526b7 - [Compressed Storage] Use smaller type of Index & StorageIndex for determining maximum size during resize.
772057c55 - Revert "Add asserts for .chip"
6163dbe2b - Allow specifying a temporary directory for fileio outputs.
6b6bb9d34 - Fix unused warnings in failtest.
f6e41e643 - Revert "Clean up stableNorm"
b1ae206ea - Add asserts for .chip
b0f906419 - add missing constexpr qualifier
34fd46a9b - Update CI with testing framework from eigen_ci_cross_testing.
b2814d53a - Fix stableNorm when input is zero-sized.
c29a41011 - check pointers before freeing
48e0c827d - [ROCm] MI300 related test support
538577301 - Fix TensorForcedEval in the case of the evaluator being copied.
3f3bc6d86 - Improve documentation of SparseLU
d33174d5a - Doc: Fix Basic slicing examples minor issues
19b119288 - Add factor getters to Cholmod LLT/LDLT
a1a96fafd - Clean up stableNorm
3026f1f29 - Fix various asan errors.
a2cf99ec6 - Fix GPU+clang+asan.
fee5d60b5 - Fix MSAN failures.
9697d481c - Fix up clang-format CI.
85efa8329 - Set up clang-format in CI
2c4541f73 - fix msvc clz
75e273afc - Add internal ctz/clz implementation.

PiperOrigin-RevId: 601659527
Change-Id: I52089e61fd81692bb2147f7d15ad98eb56da27c1
diff --git a/Eigen/Core b/Eigen/Core
index 39f2b3f..f9d9974 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -378,10 +378,6 @@
 #include "src/Core/arch/AVX512/GemmKernel.h"
 #endif
 
-#if defined(EIGEN_VECTORIZE_HVX)
-#include "src/Core/arch/HVX/GeneralBlockPanelKernel.h"
-#endif
-
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
 #include "src/Core/PartialReduxEvaluator.h"
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index be2e737..447a393 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -149,11 +149,20 @@
 
 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
  * The data are not copied but shared. */
-template <typename Scalar, int Flags, typename StorageIndex>
-Map<SparseMatrix<Scalar, Flags, StorageIndex> > viewAsEigen(cholmod_sparse& cm) {
-  return Map<SparseMatrix<Scalar, Flags, StorageIndex> >(cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
-                                                         static_cast<StorageIndex*>(cm.p),
-                                                         static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
+template <typename Scalar, typename StorageIndex>
+Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_sparse& cm) {
+  return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >(
+      cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol], static_cast<StorageIndex*>(cm.p),
+      static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
+}
+
+/** Returns a view of the Cholmod sparse matrix factor \a cm as an Eigen sparse matrix.
+ * The data are not copied but shared. */
+template <typename Scalar, typename StorageIndex>
+Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_factor& cm) {
+  return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >(
+      cm.n, cm.n, static_cast<StorageIndex*>(cm.p)[cm.n], static_cast<StorageIndex*>(cm.p),
+      static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
 }
 
 namespace internal {
@@ -188,6 +197,7 @@
 EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
 
 EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_sparse*, factor_to_sparse, cholmod_factor, L)
 
 template <typename StorageIndex_>
 inline cholmod_dense* cm_solve(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
@@ -377,7 +387,7 @@
     // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
     // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's
     // sparse solver)
-    dest.derived() = viewAsEigen<typename DestDerived::Scalar, ColMajor, typename DestDerived::StorageIndex>(*x_cs);
+    dest.derived() = viewAsEigen<typename DestDerived::Scalar, typename DestDerived::StorageIndex>(*x_cs);
     internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
   }
 #endif  // EIGEN_PARSED_BY_DOXYGEN
@@ -483,6 +493,11 @@
 
  public:
   typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef TriangularView<const MatrixType, Eigen::Lower> MatrixL;
+  typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::Upper> MatrixU;
 
   CholmodSimplicialLLT() : Base() { init(); }
 
@@ -493,6 +508,12 @@
 
   ~CholmodSimplicialLLT() {}
 
+  /** \returns an expression of the factor L */
+  inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); }
+
+  /** \returns an expression of the factor U (= L^*) */
+  inline MatrixU matrixU() const { return matrixL().adjoint(); }
+
  protected:
   void init() {
     m_cholmod.final_asis = 0;
@@ -531,6 +552,12 @@
 
  public:
   typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef TriangularView<const MatrixType, Eigen::UnitLower> MatrixL;
+  typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
 
   CholmodSimplicialLDLT() : Base() { init(); }
 
@@ -541,6 +568,26 @@
 
   ~CholmodSimplicialLDLT() {}
 
+  /** \returns a vector expression of the diagonal D */
+  inline VectorType vectorD() const {
+    auto cholmodL = viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor);
+
+    VectorType D{cholmodL.rows()};
+
+    for (Index k = 0; k < cholmodL.outerSize(); ++k) {
+      typename decltype(cholmodL)::InnerIterator it{cholmodL, k};
+      D(k) = it.value();
+    }
+
+    return D;
+  }
+
+  /** \returns an expression of the factor L */
+  inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); }
+
+  /** \returns an expression of the factor U (= L^*) */
+  inline MatrixU matrixU() const { return matrixL().adjoint(); }
+
  protected:
   void init() {
     m_cholmod.final_asis = 1;
@@ -578,6 +625,9 @@
 
  public:
   typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
 
   CholmodSupernodalLLT() : Base() { init(); }
 
@@ -588,6 +638,19 @@
 
   ~CholmodSupernodalLLT() {}
 
+  /** \returns an expression of the factor L */
+  inline MatrixType matrixL() const {
+    // Convert Cholmod factor's supernodal storage format to Eigen's CSC storage format
+    cholmod_sparse* cholmodL = internal::cm_factor_to_sparse(*Base::m_cholmodFactor, m_cholmod);
+    MatrixType L = viewAsEigen<Scalar, StorageIndex>(*cholmodL);
+    internal::cm_free_sparse<StorageIndex>(cholmodL, m_cholmod);
+
+    return L;
+  }
+
+  /** \returns an expression of the factor U (= L^*) */
+  inline MatrixType matrixU() const { return matrixL().adjoint(); }
+
  protected:
   void init() {
     m_cholmod.final_asis = 1;
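
For reference, the new factor accessors added above can be exercised as follows. This is a minimal sketch, assuming CHOLMOD is installed and linked; the 2x2 SPD matrix and its values are made up for illustration, and when CHOLMOD applies a fill-reducing ordering the factors refer to the permuted matrix:

    #include <Eigen/CholmodSupport>
    #include <Eigen/SparseCore>
    #include <iostream>

    int main() {
      // Tiny SPD matrix; the values are arbitrary and only for illustration.
      Eigen::SparseMatrix<double> A(2, 2);
      A.insert(0, 0) = 4.0;
      A.insert(0, 1) = 1.0;
      A.insert(1, 0) = 1.0;
      A.insert(1, 1) = 3.0;
      A.makeCompressed();

      Eigen::CholmodSimplicialLDLT<Eigen::SparseMatrix<double> > ldlt(A);
      if (ldlt.info() == Eigen::Success) {
        // New: the diagonal factor D, one entry per column of L.
        std::cout << "D = " << ldlt.vectorD().transpose() << std::endl;
      }

      Eigen::CholmodSupernodalLLT<Eigen::SparseMatrix<double> > llt(A);
      if (llt.info() == Eigen::Success) {
        // New: the supernodal factor converted to plain CSC storage.
        Eigen::SparseMatrix<double> L = llt.matrixL();
        std::cout << "L has " << L.nonZeros() << " nonzeros" << std::endl;
      }
      return 0;
    }
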
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 5936336..3a302c0 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -612,11 +612,7 @@
   }
 };
 
-#ifndef SYCL_DEVICE_ONLY
-#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func
-#else
 #define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& a, const Type& b) { return Func(a, b); }
-#endif
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise).
     If \a a or \b b is NaN, the return value is implementation defined. */
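
With this change the macro expands to the wrapping lambda in every build, not just SYCL device code. A stand-in macro with the same shape illustrates the effect; this is a minimal sketch, and the names BINARY_OP_NAN_PROPAGATION and fmin2 are illustrative, not Eigen's:

    #include <algorithm>
    #include <iostream>

    // Stand-in with the same shape as the patched macro: it always wraps
    // Func in a lambda instead of passing the bare name through, so the
    // result has a unique callable type.
    #define BINARY_OP_NAN_PROPAGATION(Type, Func) \
      [](const Type& a, const Type& b) { return Func(a, b); }

    float fmin2(const float& a, const float& b) { return std::min(a, b); }

    int main() {
      auto op = BINARY_OP_NAN_PROPAGATION(float, fmin2);
      std::cout << op(1.0f, 2.0f) << std::endl;  // prints 1
    }
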
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 95f9b97..0be29bc 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -628,6 +628,147 @@
   // no value, error at compile time
 };
 
+template <typename BitsType, typename EnableIf = void>
+struct count_bits_impl {
+  static_assert(std::is_integral<BitsType>::value && std::is_unsigned<BitsType>::value,
+                "BitsType must be an unsigned integer");
+
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    int n = CHAR_BIT * sizeof(BitsType);
+    int shift = n / 2;
+    while (bits > 0 && shift > 0) {
+      BitsType y = bits >> shift;
+      if (y > 0) {
+        n -= shift;
+        bits = y;
+      }
+      shift /= 2;
+    }
+    if (shift == 0) {
+      --n;
+    }
+    return n;
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    int n = CHAR_BIT * sizeof(BitsType);
+    int shift = n / 2;
+    while (bits > 0 && shift > 0) {
+      BitsType y = bits << shift;
+      if (y > 0) {
+        n -= shift;
+        bits = y;
+      }
+      shift /= 2;
+    }
+    if (shift == 0) {
+      --n;
+    }
+    return n;
+  }
+};
+
+// Count leading zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+  return count_bits_impl<BitsType>::clz(bits);
+}
+
+// Count trailing zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+  return count_bits_impl<BitsType>::ctz(bits);
+}
+
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned int)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    static constexpr int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT;
+    return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset;
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    return bits == 0 ? kNumBits : __builtin_ctz(static_cast<unsigned int>(bits));
+  }
+};
+
+template <typename BitsType>
+struct count_bits_impl<
+    BitsType, std::enable_if_t<sizeof(unsigned int) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(unsigned long)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    static constexpr int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT;
+    return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset;
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    return bits == 0 ? kNumBits : __builtin_ctzl(static_cast<unsigned long>(bits));
+  }
+};
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) &&
+                                                  sizeof(BitsType) <= sizeof(unsigned long long)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    static constexpr int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT;
+    return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset;
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    return bits == 0 ? kNumBits : __builtin_ctzll(static_cast<unsigned long long>(bits));
+  }
+};
+
+#elif EIGEN_COMP_MSVC
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned long)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    unsigned long out;
+    _BitScanReverse(&out, static_cast<unsigned long>(bits));
+    return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    unsigned long out;
+    _BitScanForward(&out, static_cast<unsigned long>(bits));
+    return bits == 0 ? kNumBits : static_cast<int>(out);
+  }
+};
+
+#ifdef _WIN64
+
+template <typename BitsType>
+struct count_bits_impl<
+    BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(__int64)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    unsigned long out;
+    _BitScanReverse64(&out, static_cast<unsigned __int64>(bits));
+    return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    unsigned long out;
+    _BitScanForward64(&out, static_cast<unsigned __int64>(bits));
+    return bits == 0 ? kNumBits : static_cast<int>(out);
+  }
+};
+
+#endif  // _WIN64
+
+#endif  // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
 template <typename Scalar>
 struct random_default_impl<Scalar, false, true> {
   static inline Scalar run(const Scalar& x, const Scalar& y) {
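
The contract of the new clz/ctz helpers above, including the zero case where both return the bit width of the type, can be checked against a portable reference. This is a minimal sketch for illustration, not the library's dispatch code:

    #include <cstdint>
    #include <iostream>

    // Reference semantics for 32-bit inputs: clz counts leading zero bits,
    // ctz counts trailing zero bits, and both return 32 for an input of 0.
    int clz32(std::uint32_t x) {
      int n = 0;
      for (std::uint32_t m = 0x80000000u; m != 0 && (x & m) == 0; m >>= 1) ++n;
      return n;
    }

    int ctz32(std::uint32_t x) {
      if (x == 0) return 32;
      int n = 0;
      while ((x & 1u) == 0) {
        x >>= 1;
        ++n;
      }
      return n;
    }

    int main() {
      std::cout << clz32(1u) << std::endl;           // 31
      std::cout << ctz32(8u) << std::endl;           // 3
      std::cout << clz32(0u) << std::endl;           // 32
      std::cout << ctz32(0x80000000u) << std::endl;  // 31
    }
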
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index a8307c7..f9bf737 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -204,7 +204,9 @@
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
    *
    * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { return m_storage.data()[index]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index index) const {
+    return m_storage.data()[index];
+  }
 
   /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index fc11174..131e6f1 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-//
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
-#define EIGEN_PACKET_MATH_FP16_AVX512_H
-
-// IWYU pragma: private
-#include "../../InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
-
-template <>
-struct is_arithmetic<Packet8h> {
-  enum { value = true };
-};
-
-template <>
-struct packet_traits<half> : default_packet_traits {
-  typedef Packet32h type;
-  typedef Packet16h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 32,
-
-    HasCmp = 1,
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasNegate = 1,
-    HasAbs = 1,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 1,
-    HasSetLinear = 0,
-    HasLog = 1,
-    HasLog1p = 1,
-    HasExp = 1,
-    HasExpm1 = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    // These ones should be implemented in future
-    HasBessel = 0,
-    HasNdtri = 0,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = 0,  // EIGEN_FAST_MATH,
-    HasBlend = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
-  };
-};
-
-template <>
-struct unpacket_traits<Packet32h> {
-  typedef Eigen::half type;
-  typedef Packet16h half;
-  enum {
-    size = 32,
-    alignment = Aligned64,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-template <>
-struct unpacket_traits<Packet16h> {
-  typedef Eigen::half type;
-  typedef Packet8h half;
-  enum {
-    size = 16,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-template <>
-struct unpacket_traits<Packet8h> {
-  typedef Eigen::half type;
-  typedef Packet8h half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-// Memory functions
-
-// pset1
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
-  return _mm512_set1_ph(static_cast<_Float16>(from));
-}
-
-// pset1frombits
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
-  return _mm512_castsi512_ph(_mm512_set1_epi16(from));
-}
-
-// pfirst
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  return half_impl::raw_uint16_to_half(
-      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
-  Eigen::half dest[32];
-  _mm512_storeu_ph(dest, from);
-  return dest[0];
-#endif
-}
-
-// pload
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
-}
-
-// ploadu
-
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
-}
-
-// pstore
-
-template <>
-EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
-}
-
-// pstoreu
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
-}
-
-// ploaddup
-template <>
-EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
-  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
-  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
-                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
-                               a);
-}
-
-// ploadquad
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
-  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
-  return _mm512_permutexvar_ph(
-      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
-      a);
-}
-
-// pabs
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
-  return _mm512_abs_ph(a);
-}
-
-// psignbit
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
-  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
-}
-
-// pmin
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_min_ph(a, b);
-}
-
-// pmax
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_max_ph(a, b);
-}
-
-// plset
-template <>
-EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
-  return _mm512_add_ph(_mm512_set1_ph(a),
-                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
-                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
-                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
-}
-
-// por
-
-template <>
-EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pxor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pand
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pandnot
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
-}
-
-// pselect
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
-  return _mm512_mask_blend_ph(mask32, a, b);
-}
-
-// pcmp_eq
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_le
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt_or_nan
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
-}
-
-// padd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_add_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// psub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_sub_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pmul
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_mul_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pdiv
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_div_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pround
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
-  // Work-around for default std::round rounding mode.
-
-  // Mask for the sign bit
-  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
-  // The largest half-preicision float less than 0.5
-  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
-
-  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-// print
-
-template <>
-EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
-  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
-}
-
-// pceil
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
-  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
-}
-
-// pfloor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
-  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
-}
-
-// predux
-template <>
-EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
-  return (half)_mm512_reduce_add_ph(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
-  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
-  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
-}
-
-// predux_half_dowto4
-template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
-  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
-
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
-  Eigen::half data[32];
-  _mm512_storeu_ph(data, a);
-
-  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
-  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
-}
-
-// predux_max
-
-// predux_min
-
-// predux_mul
-
-#ifdef EIGEN_VECTORIZE_FMA
-
-// pmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fnmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fnmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-#endif
-
-// pnegate
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
-  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
-}
-
-// pconj
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
-  return a;
-}
-
-// psqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_sqrt_ph(a);
-}
-
-// prsqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_rsqrt_ph(a);
-}
-
-// preciprocal
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
-  return _mm512_rcp_ph(a);
-}
-
-// ptranspose
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
-  __m512i t[32];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 16; i++) {
-    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
-    t[2 * i + 1] =
-        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
-  }
-
-  __m512i p[32];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 8; i++) {
-    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
-    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
-    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
-    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
-  }
-
-  __m512i q[32];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 4; i++) {
-    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
-    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
-    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
-    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
-    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
-    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
-    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
-    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
-  }
-
-  __m512i f[32];
-
-#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
-  do {                                                                                              \
-    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
-    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
-    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
-    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
-    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
-    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
-    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
-    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
-  } while (false);
-
-  PACKET32H_TRANSPOSE_HELPER(0, 0);
-  PACKET32H_TRANSPOSE_HELPER(1, 1);
-  PACKET32H_TRANSPOSE_HELPER(2, 2);
-  PACKET32H_TRANSPOSE_HELPER(3, 3);
-
-  PACKET32H_TRANSPOSE_HELPER(1, 0);
-  PACKET32H_TRANSPOSE_HELPER(2, 0);
-  PACKET32H_TRANSPOSE_HELPER(3, 0);
-  PACKET32H_TRANSPOSE_HELPER(2, 1);
-  PACKET32H_TRANSPOSE_HELPER(3, 1);
-  PACKET32H_TRANSPOSE_HELPER(3, 2);
-
-  PACKET32H_TRANSPOSE_HELPER(0, 1);
-  PACKET32H_TRANSPOSE_HELPER(0, 2);
-  PACKET32H_TRANSPOSE_HELPER(0, 3);
-  PACKET32H_TRANSPOSE_HELPER(1, 2);
-  PACKET32H_TRANSPOSE_HELPER(1, 3);
-  PACKET32H_TRANSPOSE_HELPER(2, 3);
-
-#undef PACKET32H_TRANSPOSE_HELPER
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 32; i++) {
-    a.packet[i] = _mm512_castsi512_ph(f[i]);
-  }
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
-  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
-  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
-  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
-  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-
-  p0 = _mm512_unpacklo_epi32(t0, t2);
-  p1 = _mm512_unpackhi_epi32(t0, t2);
-  p2 = _mm512_unpacklo_epi32(t1, t3);
-  p3 = _mm512_unpackhi_epi32(t1, t3);
-
-  a0 = p0;
-  a1 = p1;
-  a2 = p2;
-  a3 = p3;
-
-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
-
-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
-
-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
-
-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
-
-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
-
-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
-
-  a.packet[0] = _mm512_castsi512_ph(a0);
-  a.packet[1] = _mm512_castsi512_ph(a1);
-  a.packet[2] = _mm512_castsi512_ph(a2);
-  a.packet[3] = _mm512_castsi512_ph(a3);
-}
-
-// preverse
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
-  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
-                               a);
-}
-
-// pscatter
-
-template <>
-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
-  EIGEN_ALIGN64 half aux[32];
-  pstore(aux, from);
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 32; i++) {
-    to[stride * i] = aux[i];
-  }
-}
-
-// pgather
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
-  return _mm512_castsi512_ph(_mm512_set_epi16(
-      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
-      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
-      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
-      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
-      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
-      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
-      from[1 * stride].x, from[0 * stride].x));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
-  __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
-  return _mm512_castpd_ph(result);
-}
-
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
-  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
-  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
-template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = psin(low);
-  Packet16h highOut = psin(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pcos(low);
-  Packet16h highOut = pcos(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog(low);
-  Packet16h highOut = plog(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog2(low);
-  Packet16h highOut = plog2(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog1p(low);
-  Packet16h highOut = plog1p(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexp(low);
-  Packet16h highOut = pexp(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexpm1(low);
-  Packet16h highOut = pexpm1(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = ptanh(low);
-  Packet16h highOut = ptanh(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1 = _mm256_undefined_si256();
-  Packet16h exp2 = _mm256_undefined_si256();
-
-  Packet16h lowOut = pfrexp(low, exp1);
-  Packet16h highOut = pfrexp(high, exp2);
-
-  exponent = combine2Packet16h(exp1, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1;
-  Packet16h exp2;
-  extract2Packet16h(exponent, exp1, exp2);
-
-  Packet16h lowOut = pldexp(low, exp1);
-  Packet16h highOut = pldexp(high, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+typedef __m512h Packet32h;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+
+template <>
+struct is_arithmetic<Packet8h> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet32h type;
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 32,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    // These ones should be implemented in future
+    HasBessel = 0,
+    HasNdtri = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = 0,  // EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet32h> {
+  typedef Eigen::half type;
+  typedef Packet16h half;
+  enum {
+    size = 32,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// Memory functions
+
+// pset1
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
+  return _mm512_set1_ph(static_cast<_Float16>(from));
+}
+
+// pset1frombits
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
+  return _mm512_castsi512_ph(_mm512_set1_epi16(from));
+}
+
+// pfirst
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return half_impl::raw_uint16_to_half(
+      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
+#else
+  Eigen::half dest[32];
+  _mm512_storeu_ph(dest, from);
+  return dest[0];
+#endif
+}
+
+// pload
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
+}
+
+// ploadu
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
+}
+
+// pstore
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
+}
+
+// pstoreu
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
+}
+
+// ploaddup
+template <>
+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
+  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
+  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+                               a);
+}
+
+// ploadquad
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
+  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
+  return _mm512_permutexvar_ph(
+      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+      a);
+}
+
+// pabs
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
+  return _mm512_abs_ph(a);
+}
+
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
+// pmin
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_min_ph(a, b);
+}
+
+// pmax
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_max_ph(a, b);
+}
+
+// plset
+template <>
+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
+  return _mm512_add_ph(_mm512_set1_ph(a),
+                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
+                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
+                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+
+// por
+
+template <>
+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pxor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pand
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pandnot
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+}
+
+// pselect
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_ph(mask32, a, b);
+}
+
+// pcmp_eq
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_le
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt_or_nan
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+}
+
+// padd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// psub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pmul
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pdiv
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pround
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit
+  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+  // The largest half-preicision float less than 0.5
+  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+// print
+
+template <>
+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+// pceil
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+// pfloor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// predux
+template <>
+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
+  return (half)_mm512_reduce_add_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
+  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
+  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+}
+
+// predux_half_dowto4
+template <>
+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
+  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+
+  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#else
+  Eigen::half data[32];
+  _mm512_storeu_ph(data, a);
+
+  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
+  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
+
+  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#endif
+}
+
+// predux_max
+
+// predux_min
+
+// predux_mul
+
+#ifdef EIGEN_VECTORIZE_FMA
+
+// pmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+#endif
+
+// pnegate
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
+  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
+}
+
+// pconj
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
+  return a;
+}
+
+// psqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
+  return _mm512_sqrt_ph(a);
+}
+
+// prsqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
+  return _mm512_rsqrt_ph(a);
+}
+
+// preciprocal
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
+  return _mm512_rcp_ph(a);
+}
+
+// ptranspose
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
+  __m512i t[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 16; i++) {
+    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+    t[2 * i + 1] =
+        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+  }
+
+  __m512i p[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 8; i++) {
+    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
+    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
+    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
+    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
+  }
+
+  __m512i q[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 4; i++) {
+    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
+    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
+    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
+    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
+    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
+    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
+    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
+    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
+  }
+
+  __m512i f[32];
+
+#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
+  do {                                                                                              \
+    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
+    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
+    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
+    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
+    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
+    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
+    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
+    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+  } while (false);
+
+  PACKET32H_TRANSPOSE_HELPER(0, 0);
+  PACKET32H_TRANSPOSE_HELPER(1, 1);
+  PACKET32H_TRANSPOSE_HELPER(2, 2);
+  PACKET32H_TRANSPOSE_HELPER(3, 3);
+
+  PACKET32H_TRANSPOSE_HELPER(1, 0);

+  PACKET32H_TRANSPOSE_HELPER(2, 0);

+  PACKET32H_TRANSPOSE_HELPER(3, 0);

+  PACKET32H_TRANSPOSE_HELPER(2, 1);

+  PACKET32H_TRANSPOSE_HELPER(3, 1);

+  PACKET32H_TRANSPOSE_HELPER(3, 2);

+

+  PACKET32H_TRANSPOSE_HELPER(0, 1);

+  PACKET32H_TRANSPOSE_HELPER(0, 2);

+  PACKET32H_TRANSPOSE_HELPER(0, 3);

+  PACKET32H_TRANSPOSE_HELPER(1, 2);

+  PACKET32H_TRANSPOSE_HELPER(1, 3);

+  PACKET32H_TRANSPOSE_HELPER(2, 3);

+

+#undef PACKET32H_TRANSPOSE_HELPER

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 32; i++) {

+    a.packet[i] = _mm512_castsi512_ph(f[i]);

+  }

+}

+

+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {

+  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;

+  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));

+  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));

+  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));

+  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));

+

+  p0 = _mm512_unpacklo_epi32(t0, t2);

+  p1 = _mm512_unpackhi_epi32(t0, t2);

+  p2 = _mm512_unpacklo_epi32(t1, t3);

+  p3 = _mm512_unpackhi_epi32(t1, t3);

+

+  a0 = p0;

+  a1 = p1;

+  a2 = p2;

+  a3 = p3;

+

+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);

+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);

+

+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);

+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);

+

+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);

+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);

+

+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);

+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);

+

+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);

+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);

+

+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);

+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);

+

+  a.packet[0] = _mm512_castsi512_ph(a0);

+  a.packet[1] = _mm512_castsi512_ph(a1);

+  a.packet[2] = _mm512_castsi512_ph(a2);

+  a.packet[3] = _mm512_castsi512_ph(a3);

+}

+

+// preverse

+

+template <>

+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {

+  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

+                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),

+                               a);

+}

+

+// pscatter

+

+template <>

+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {

+  EIGEN_ALIGN64 half aux[32];

+  pstore(aux, from);

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 32; i++) {

+    to[stride * i] = aux[i];

+  }

+}

+

+// pgather

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {

+  return _mm512_castsi512_ph(_mm512_set_epi16(

+      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,

+      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,

+      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,

+      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,

+      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,

+      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,

+      from[1 * stride].x, from[0 * stride].x));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);

+

+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {

+  __m512d result = _mm512_undefined_pd();

+  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);

+  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);

+  return _mm512_castpd_ph(result);

+}

+

+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {

+  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));

+  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));

+}

+

+// psin

+template <>

+EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = psin(low);

+  Packet16h highOut = psin(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pcos

+template <>

+EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = pcos(low);

+  Packet16h highOut = pcos(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// plog

+template <>

+EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = plog(low);

+  Packet16h highOut = plog(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// plog2

+template <>

+EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = plog2(low);

+  Packet16h highOut = plog2(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// plog1p

+template <>

+EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = plog1p(low);

+  Packet16h highOut = plog1p(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pexp

+template <>

+EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = pexp(low);

+  Packet16h highOut = pexp(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pexpm1

+template <>

+EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = pexpm1(low);

+  Packet16h highOut = pexpm1(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// ptanh

+template <>

+EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = ptanh(low);

+  Packet16h highOut = ptanh(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pfrexp

+template <>

+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h exp1 = _mm256_undefined_si256();

+  Packet16h exp2 = _mm256_undefined_si256();

+

+  Packet16h lowOut = pfrexp(low, exp1);

+  Packet16h highOut = pfrexp(high, exp2);

+

+  exponent = combine2Packet16h(exp1, exp2);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pldexp

+template <>

+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h exp1;

+  Packet16h exp2;

+  extract2Packet16h(exponent, exp1, exp2);

+

+  Packet16h lowOut = pldexp(low, exp1);

+  Packet16h highOut = pldexp(high, exp2);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+}  // end namespace internal

+}  // end namespace Eigen

+

+#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
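
For reference, every Packet32h transcendental above follows the same split-evaluate-combine pattern through extract2Packet16h/combine2Packet16h. A minimal standalone sketch of that pattern (apply_by_halves is a hypothetical name, not part of this patch; assumes a compiler with AVX512FP16 support, e.g. -mavx512fp16):

    #include <immintrin.h>

    // Hypothetical helper mirroring extract2Packet16h/combine2Packet16h:
    // split a 32-lane half vector into two 16-lane halves, run a 16-lane
    // kernel f16 on each, then stitch the results back together.
    template <typename F>
    __m512h apply_by_halves(__m512h x, F f16) {
      __m256h lo = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
      __m256h hi = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
      __m512d r = _mm512_undefined_pd();
      r = _mm512_insertf64x4(r, _mm256_castph_pd(f16(lo)), 0);
      r = _mm512_insertf64x4(r, _mm256_castph_pd(f16(hi)), 1);
      return _mm512_castpd_ph(r);
    }

Conceptually, psin<Packet32h> is apply_by_halves(a, psin<Packet16h>): the 16-lane implementations declared above do the actual math.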

diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 8fb5b68..d84b1cc 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -582,8 +582,8 @@
 
 // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
 // using "Extended precision modular arithmetic"
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
-  // This version requires true FMA for high accuracy
+#if defined(EIGEN_VECTORIZE_FMA)
+  // This version requires true FMA for high accuracy.
   // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
   const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
   x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
@@ -1181,7 +1181,7 @@
   s_lo = psub(y, t);
 }
 
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#ifdef EIGEN_VECTORIZE_FMA
 // This function implements the extended precision product of
 // a pair of floating point numbers. Given {x, y}, it computes the pair
 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
@@ -1227,7 +1227,7 @@
   p_lo = pmadd(x_lo, y_lo, p_lo);
 }
 
-#endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif  // EIGEN_VECTORIZE_FMA
 
 // This function implements Dekker's algorithm for the addition
 // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
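
The EIGEN_VECTORIZE_FMA requirement in this file exists because the extended-precision product relies on a single-rounding fused multiply-add: fma(x, y, -x*y) recovers the exact rounding error of the product. A scalar model of that identity (not Eigen's code):

    #include <cmath>
    #include <cstdio>

    // twoProd: split the exact product x*y into p_hi + p_lo. p_lo is exact
    // only if fmaf is a true single-rounding FMA (barring over/underflow).
    void two_prod(float x, float y, float& p_hi, float& p_lo) {
      p_hi = x * y;
      p_lo = std::fmaf(x, y, -p_hi);
    }

    int main() {
      float hi, lo;
      two_prod(1.0f + 0x1p-12f, 3.0f, hi, lo);
      std::printf("hi=%a lo=%a\n", hi, lo);  // hi + lo == x * y exactly
      return 0;
    }
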
diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
deleted file mode 100644
index a159739..0000000
--- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
-#define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
-
-// Only support 128B HVX now.
-// Floating-point operations are only supported since V68.
-#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
-
-namespace Eigen {
-namespace internal {
-
-template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
-class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
-    : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
- public:
-  typedef Packet32qf AccPacket;
-
-  EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero<Packet32qf>(p); }
-
-  template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, Packet32qf& c, Packet32f& /*tmp*/,
-                                const LaneIdType&) const {
-    c = pmadd_f32_to_qf32(a, b, c);
-  }
-
-  template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const QuadPacket<Packet32f>& b, Packet32qf& c, Packet32f& tmp,
-                                const LaneIdType& lane) const {
-    madd(a, b.get(lane), c, tmp, lane);
-  }
-
-  EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, Packet32f& r) const {
-    r = pmadd_qf32_to_f32(c, alpha, r);
-  }
-};
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
-
-#endif  // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
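
The deleted specialization routed the GEBP inner kernel through a qfloat32 accumulator (pmadd_f32_to_qf32 per depth step, one conversion back to float in acc). A scalar model of the trait protocol it implemented, for orientation only (not the real kernel):

    // initAcc zeroes the accumulator, madd performs one multiply-accumulate
    // per depth step, and acc folds the accumulator into the output,
    // scaled by alpha.
    struct ScalarGebpTraitsModel {
      using AccPacket = float;
      void initAcc(AccPacket& c) const { c = 0.0f; }
      void madd(float a, float b, AccPacket& c) const { c += a * b; }
      void acc(const AccPacket& c, float alpha, float& r) const { r += c * alpha; }
    };
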
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index 7c69f3b..7e139de 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -18,18 +18,107 @@
 namespace Eigen {
 namespace internal {
 
-EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); }
+// HVX utilities.
 
-EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); }
+template <int D>
+EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
+  HVX_Vector v;
+#if EIGEN_COMP_CLANG
+  // Use inline assembly for an aligned vmem load on unaligned memory.
+  // Casting to HVX_Vector* may mislead the compiler about data alignment.
+  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
+#else
+  void* aligned_mem =
+      reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
+  memcpy(&v, aligned_mem, __HVX_LENGTH__);
+#endif
+  return v;
+}
 
-EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; }
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
+  HVX_Vector v;
+  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
+  return v;
+}
 
-EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; }
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
+  HVX_Vector v;
+  memcpy(&v, mem, __HVX_LENGTH__);
+  return v;
+}
+
+template <size_t Size, size_t Alignment, typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
+#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
+  // Fast partial vector load through aligned vmem load.
+  // The load may read past the end of the array, but it is aligned so it cannot fault.
+  HVX_Vector v0 = HVX_vmem<0>(mem);
+  HVX_Vector v1 = v0;
+  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
+    // A block no larger than its alignment never crosses two aligned vectors.
+    v1 = v0;
+  }
+  else {
+    uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+    if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
+      v1 = HVX_vmem<1>(mem);
+    } else {
+      v1 = v0;
+    }
+  }
+  return Q6_V_valign_VVR(v1, v0, mem_addr);
+#else
+  HVX_Vector v;
+  memcpy(&v, mem, Size * sizeof(T));
+  return v;
+#endif
+}
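
A scalar model of the fast path above (assumptions: 128-byte vectors; as plain C++ this model itself over-reads and is for illustration only): read the aligned vector containing mem, read the following aligned vector only if the requested bytes spill across the boundary, then shift so that byte 0 of the result is mem[0] — the job Q6_V_valign_VVR does in hardware.

    #include <cstdint>
    #include <cstring>

    constexpr std::size_t kVecLen = 128;  // __HVX_LENGTH__ in the real code

    void partial_load_model(const std::uint8_t* mem, std::size_t size,
                            std::uint8_t* out) {
      std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(mem);
      std::uintptr_t base = addr & ~std::uintptr_t(kVecLen - 1);  // aligned start
      std::size_t off = addr - base;
      std::uint8_t v[2 * kVecLen];
      // First aligned load always touches mem's own aligned block, so the
      // hardware equivalent cannot fault even though it reads early.
      std::memcpy(v, reinterpret_cast<const void*>(base), kVecLen);
      if (off + size > kVecLen)  // data spills into the next aligned block
        std::memcpy(v + kVecLen, reinterpret_cast<const void*>(base + kVecLen),
                    kVecLen);
      std::memcpy(out, v + off, size);  // the "valign" shift
    }
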
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
+  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
+  memcpy(mem, &v, __HVX_LENGTH__);
+}
+
+template <size_t Size, size_t Alignment, typename T>
+EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
+  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
+  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+  uintptr_t right_off = left_off + Size * sizeof(T);
+
+  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
+  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
+
+  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
+    if (right_off > __HVX_LENGTH__) {
+      Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
+      qr = Q6_Q_vcmp_eq_VbVb(value, value);
+    }
+  }
+
+  ql_not = Q6_Q_or_QQn(ql_not, qr);
+  Q6_vmem_QnRIV(ql_not, mem, value);
+}
+
+// Packet definitions.
+enum class HVXPacketSize {
+  Full,
+  Half,
+  Quarter,
+};
 
 // The Hexagon compiler uses the same HVX_Vector to represent all HVX vector types.
 // Wrap the different vector types (float32, int32, etc.) in distinct classes with
 // explicit constructors and casts back and forth to HVX_Vector.
-template <int unique_id>
+template <HVXPacketSize T>
 class HVXPacket {
  public:
   HVXPacket() = default;
@@ -41,24 +130,62 @@
   HVX_Vector m_val = Q6_V_vzero();
 };
 
-typedef HVXPacket<0> Packet32f;   // float32
-typedef HVXPacket<1> Packet32qf;  // qfloat32
+typedef HVXPacket<HVXPacketSize::Full> Packet32f;
+typedef HVXPacket<HVXPacketSize::Half> Packet16f;
+typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
 
+// Packet traits.
 template <>
 struct packet_traits<float> : default_packet_traits {
   typedef Packet32f type;
-  typedef Packet32f half;
+  typedef Packet16f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 32,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 0,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 0,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+
+    HasDiv = 0,
+    HasFloor = 0,
+    HasCeil = 0,
+    HasRint = 0,
+
+    HasSin = 0,
+    HasCos = 0,
+    HasACos = 0,
+    HasASin = 0,
+    HasATan = 0,
+    HasATanh = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = 0,
+    HasErf = 0,
+    HasBessel = 0,
+    HasNdtri = 0
   };
 };
 
 template <>
 struct unpacket_traits<Packet32f> {
   typedef float type;
-  typedef Packet32f half;
+  typedef Packet16f half;
   enum {
     size = 32,
     alignment = Aligned128,
@@ -68,15 +195,89 @@
   };
 };
 
-// float32 operations.
 template <>
-EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+struct unpacket_traits<Packet16f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum {
+    size = 16,
+    // Much code assumes alignment equal to the packet size instead of following this trait,
+    // so we do not use Aligned128 to optimize aligned load/store.
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum {
+    size = 8,
+    // Much code assumes alignment equal to the packet size instead of following this trait,
+    // so we do not use Aligned128 to optimize aligned load/store.
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// float32 operations.
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
+  return HVXPacket<T>::Create(Q6_V_vzero());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
+  return pzero_hvx(Packet32f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
+  return pzero_hvx(Packet16f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
+  return pzero_hvx(Packet8f());
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  return unpacket_traits<HVXPacket<T>>::half::Create(
+      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
+  return predux_half_dowto4_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
+  return predux_half_dowto4_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
   union {
     float f;
     int32_t i;
   } u;
   u.f = from;
-  return Packet32f::Create(Q6_V_vsplat_R(u.i));
+  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Quarter>(from);
 }
 
 template <>
@@ -84,78 +285,236 @@
   return Packet32f::Create(HVX_load(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+  return Packet16f::Create(
+      HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+  return Packet8f::Create(
+      HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
   return Packet32f::Create(HVX_loadu(from));
 }
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+  return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+  return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
+}
 
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
   HVX_store(to, from.Get());
 }
 template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+  HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
+}
+
+template <>
 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
   HVX_storeu(to, from.Get());
 }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+  HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
+}
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+  return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pmul_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+  return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return padd_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+  return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return psub_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
-  return psub(Packet32f::Create(Q6_V_vzero()), a);
+  return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+  return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
+  return pnegate_hvx(a);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
   HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+  return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+  return pcmp_le_hvx(a, b);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
   HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
   HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+  return pcmp_eq_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
-  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+  return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+  return pcmp_lt_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
-  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
-  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), Q6_V_vzero());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, a.Get(), pnegate(a).Get()));
+  return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
+  return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+  return pabs_hvx(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
+  union {
+    float array[1];
+    HVX_Vector vector;
+  } HVX_and_array;
+  HVX_and_array.vector = a.Get();
+  return HVX_and_array.array[0];
+}
 template <>
 EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
-  float vsf[32] __attribute__((aligned(128)));
-  pstore(vsf, a);
-  return vsf[0];
+  return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
+  return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
+  return pfirst_hvx(a);
 }
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
@@ -166,13 +525,107 @@
   // Shuffle the 64-bit lanes.
   HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
   HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
-
   kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
   kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
   kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
   kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
 }
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
 
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
+  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+
+  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
+  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
+  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
+  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
+  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
+  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
+  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
+  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
+
+  // Shuffle the 256-bit lanes.
+  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+
+  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
+  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
+  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
+  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
+  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
+  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
+  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
+  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
+  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
+  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
+  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
+  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
+}
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
   // Shuffle the 32-bit lanes.
   HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
@@ -298,29 +751,67 @@
   kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
+  for (int i = 2; i < packet_size; i <<= 1) {
+    vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
+  }
+  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
+}
 template <>
 EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
-  HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get());
-  HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4);
-  HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
-  HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
-  HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
-  return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64)));
+  return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+  return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
+  return predux_hvx(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
+  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
+  HVX_Vector load = HVX_load_partial<size, 0>(from);
+  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
+  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
-  HVX_Vector load = HVX_loadu(from);
-  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
-  return Packet32f::Create(HEXAGON_HVX_GET_V0(dup));
+  return ploaddup_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
-  HVX_Vector load = HVX_loadu(from);
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
+  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
+  HVX_Vector load = HVX_load_partial<size, 0>(from);
   HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
   HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
-  return Packet32f::Create(HEXAGON_HVX_GET_V0(quad));
+  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
 }
 
 template <>
@@ -330,99 +821,249 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
+  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
+  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
+  return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
+  return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
+  return pmin_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+}
+template <>
 EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+  return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
+  return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
+  return pmax_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() & b.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(a.Get() & b.Get());
+  return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
+  return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
+  return pand_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() | b.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(a.Get() | b.Get());
+  return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
+  return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
+  return por_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() ^ b.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(a.Get() ^ b.Get());
+  return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
+  return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
+  return pxor_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(~a.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
-  return Packet32f::Create(~a.Get());
+  return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
+  return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
+  return pnot_hvx(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
-  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+  return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
+  return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+  return pselect_hvx(mask, a, b);
 }
 
-template <typename Op>
-EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) {
-  Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a);
-  Packet32f vredux_8 = op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
-  Packet32f vredux_16 = op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
-  Packet32f vredux_32 = op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
-  Packet32f vredux_64 = op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
-  return pfirst(vredux_64);
+template <HVXPacketSize T, typename Op>
+EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  HVXPacket<T> vredux = a;
+  for (int i = 1; i < packet_size; i <<= 1) {
+    vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
+  }
+  return pfirst(vredux);
 }
 
 template <>
 EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
   return predux_generic(a, pmax<Packet32f>);
 }
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
+  return predux_generic(a, pmax<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
+  return predux_generic(a, pmax<Packet8f>);
+}
 
 template <>
 EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
   return predux_generic(a, pmin<Packet32f>);
 }
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
+  return predux_generic(a, pmin<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
+  return predux_generic(a, pmin<Packet8f>);
+}
 
 template <>
 EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
   return predux_generic(a, por<Packet32f>) != 0.0f;
 }
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
+  return predux_generic(a, por<Packet16f>) != 0.0f;
+}
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+  return predux_generic(a, por<Packet8f>) != 0.0f;
+}
 
 static const float index_vsf[32]
-    __attribute__((aligned(128))) = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-                                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+    __attribute__((aligned(__HVX_LENGTH__))) = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
+  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
-  return padd(pload<Packet32f>(index_vsf), pset1<Packet32f>(a));
+  return plset_hvx<HVXPacketSize::Full>(a);
 }
-
-// qfloat32 operations.
 template <>
-EIGEN_STRONG_INLINE Packet32qf pzero<Packet32qf>(const Packet32qf&) {
-  return Packet32qf::Create(Q6_V_vzero());
+EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
+  return plset_hvx<HVXPacketSize::Half>(a);
 }
-
 template <>
-EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
-  return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
+  return plset_hvx<HVXPacketSize::Quarter>(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+  pstore<float>(elements, from);
+  for (Index i = 0; i < packet_size; ++i) {
+    to[i * stride] = elements[i];
+  }
+}
 template <>
-EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
-  return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
 }
 
-// Mixed float32 and qfloat32 operations.
-EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, const Packet32f& b, const Packet32qf& c) {
-  return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+  for (Index i = 0; i < packet_size; i++) {
+    elements[i] = from[i * stride];
+  }
+  return pload<HVXPacket<T>>(elements);
 }
-
-EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, const Packet32f& b, const Packet32f& c) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(
-      Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
+template <>
+EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Full>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Half>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
 }
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 4e3a14d..71e5f5f 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -1271,7 +1271,7 @@
   return pset1<Packet2ul>(0ULL);
 }
 
-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
 template <>
 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
   return vfmaq_f32(c, a, b);
@@ -5249,7 +5249,7 @@
   return vdivq_f64(a, b);
 }
 
-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
 // See bug 936. See above comment about FMA for float.
 template <>
 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index b16952a..e692438 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -354,6 +354,7 @@
 
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_VSX 1
+#define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
 // We need to #undef all these ugly tokens defined in <altivec.h>
 // => use __vector instead of vector
@@ -365,6 +366,7 @@
 
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_ALTIVEC
+#define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
 // We need to #undef all these ugly tokens defined in <altivec.h>
 // => use __vector instead of vector
@@ -431,6 +433,11 @@
 #include <arm_fp16.h>
 #endif
 
+// Enable FMA for ARM.
+#if defined(__ARM_FEATURE_FMA)
+#define EIGEN_VECTORIZE_FMA
+#endif
+
 #if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0))
 // We can use the optimized fp16 to float and float to fp16 conversion routines
 #define EIGEN_HAS_FP16_C
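
These definitions make EIGEN_VECTORIZE_FMA the single portable signal that pmadd is a true single-rounding fused multiply-add on x86 (FMA3), PowerPC (AltiVec/VSX), and ARM (__ARM_FEATURE_FMA) alike; the GenericPacketMathFunctions and NEON hunks above switch to it. A hypothetical consumer-side check (illustrative, not part of the patch):

    #include <Eigen/Core>

    // Branch on the consolidated macro rather than per-architecture tokens.
    #if defined(EIGEN_VECTORIZE_FMA)
    constexpr bool kHasFusedMadd = true;   // pmadd rounds once
    #else
    constexpr bool kHasFusedMadd = false;  // pmadd may round twice
    #endif
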
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 31f1057..6253454 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -156,7 +156,7 @@
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
 EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) {
-  if (ptr) {
+  if (ptr != nullptr) {
     uint8_t offset = static_cast<uint8_t>(*(static_cast<uint8_t*>(ptr) - 1));
     void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
 
@@ -224,9 +224,11 @@
 EIGEN_DEVICE_FUNC inline void aligned_free(void* ptr) {
 #if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
 
-  if (ptr) check_that_malloc_is_allowed();
-  EIGEN_USING_STD(free)
-  free(ptr);
+  if (ptr != nullptr) {
+    check_that_malloc_is_allowed();
+    EIGEN_USING_STD(free)
+    free(ptr);
+  }
 
 #else
   handmade_aligned_free(ptr);
@@ -294,9 +296,11 @@
 
 template <>
 EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void* ptr) {
-  if (ptr) check_that_malloc_is_allowed();
-  EIGEN_USING_STD(free)
-  free(ptr);
+  if (ptr != nullptr) {
+    check_that_malloc_is_allowed();
+    EIGEN_USING_STD(free)
+    free(ptr);
+  }
 }
 
 template <bool Align>
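
handmade_aligned_free recovers the original allocation from a one-byte offset stored just before the pointer it hands out; the `ptr != nullptr` spelling is a readability change with identical behavior. A sketch of the allocation side that this free routine undoes (illustrative, not Eigen's exact code; it assumes the alignment fits in a uint8_t):

    #include <cstdint>
    #include <cstdlib>

    void* aligned_malloc_sketch(std::size_t size, std::size_t align) {
      std::uint8_t* base = static_cast<std::uint8_t*>(std::malloc(size + align));
      if (base == nullptr) return nullptr;
      // Round up to the next aligned address, keeping at least one byte of
      // slack in front to record how far we moved.
      std::size_t rem = reinterpret_cast<std::uintptr_t>(base) % align;
      std::uint8_t* aligned = base + (align - rem);
      aligned[-1] = static_cast<std::uint8_t>(aligned - base);  // offset in [1, align]
      return aligned;  // the free side then calls std::free(p - p[-1])
    }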
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 126b442..1ec8fb8 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -275,7 +275,7 @@
 template <typename MatrixType>
 typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::computeShift(Index iu, Index iter) {
   using std::abs;
-  if (iter == 10 || iter == 20) {
+  if ((iter == 10 || iter == 20) && iu > 1) {
     // exceptional shift, taken from http://www.netlib.org/eispack/comqr.f
     return abs(numext::real(m_matT.coeff(iu, iu - 1))) + abs(numext::real(m_matT.coeff(iu - 1, iu - 2)));
   }
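
The added `iu > 1` guard matters because the exceptional-shift formula indexes two subdiagonal entries, and the second one does not exist for a 1x1 or 2x2 active block:

    // m_matT.coeff(iu, iu - 1)      needs iu >= 1
    // m_matT.coeff(iu - 1, iu - 2)  needs iu >= 2, i.e. iu > 1
    // With the guard, tiny active blocks skip the exceptional shift and use
    // the regular shift computed below instead.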
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 1eb07f7..3132794 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -142,8 +142,8 @@
     cholmod_sparse A;
     A = viewAsCholmod(mat);
     m_rows = matrix.rows();
-    Index col = matrix.cols();
-    m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
+    m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, internal::convert_index<StorageIndex>(matrix.cols()), &A,
+                                   &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
 
     if (!m_cR) {
       m_info = NumericalIssue;
@@ -196,7 +196,7 @@
   const MatrixType matrixR() const {
     eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
     if (!m_isRUpToDate) {
-      m_R = viewAsEigen<Scalar, ColMajor, typename MatrixType::StorageIndex>(*m_cR);
+      m_R = viewAsEigen<Scalar, StorageIndex>(*m_cR);
       m_isRUpToDate = true;
     }
     return m_R;
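
Two mismatches are fixed here: the column count is narrowed explicitly via internal::convert_index<StorageIndex>, and viewAsEigen is called with the template arguments of its new two-parameter signature. A sketch of what a checked narrowing cast of this kind looks like (illustrative; Eigen's convert_index differs in details):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    template <typename Dst, typename Src>
    Dst checked_index_cast(Src v) {
      // Assert the value survives the round trip into the destination type.
      assert(v >= Src(0));
      assert(static_cast<std::uintmax_t>(v) <=
             static_cast<std::uintmax_t>((std::numeric_limits<Dst>::max)()));
      return static_cast<Dst>(v);
    }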
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 123c89c..8f8a696 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -71,8 +71,13 @@
 
   void resize(Index size, double reserveSizeFactor = 0) {
     if (m_allocatedSize < size) {
+      // Avoid underflow on the std::min<Index> call by choosing the smaller index type.
+      using SmallerIndexType =
+          typename std::conditional<static_cast<size_t>((std::numeric_limits<Index>::max)()) <
+                                        static_cast<size_t>((std::numeric_limits<StorageIndex>::max)()),
+                                    Index, StorageIndex>::type;
       Index realloc_size =
-          (std::min<Index>)(NumTraits<StorageIndex>::highest(), size + Index(reserveSizeFactor * double(size)));
+          (std::min<Index>)(NumTraits<SmallerIndexType>::highest(), size + Index(reserveSizeFactor * double(size)));
       if (realloc_size < size) internal::throw_std_bad_alloc();
       reallocate(realloc_size);
     }
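
The clamp in resize caps the reallocation at the largest value both index types can represent. If StorageIndex were wider than Index, forcing NumTraits<StorageIndex>::highest() through std::min<Index> would first truncate it, wrapping to a negative value. A minimal illustration of that failure mode (hypothetical type choices; assumes C++20's modular signed conversion):

    #include <cstdint>
    #include <limits>

    using Index = std::int32_t;          // hypothetical: narrow Index
    using StorageIndex = std::int64_t;   // hypothetical: wide StorageIndex

    // 2^63 - 1 truncated to 32 bits is all-ones, i.e. -1, so the old
    // std::min<Index>(highest, wanted) would have clamped to a negative size.
    static_assert(static_cast<Index>((std::numeric_limits<StorageIndex>::max)()) ==
                      Index(-1),
                  "truncation wraps to -1");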
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index aee3d94..29be01a 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -99,13 +99,34 @@
  * \code
  * VectorXd x(n), b(n);
  * SparseMatrix<double> A;
- * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> >   solver;
- * // fill A and b;
- * // Compute the ordering permutation vector from the structural pattern of A
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
+ * // Fill A and b.
+ * // Compute the ordering permutation vector from the structural pattern of A.
  * solver.analyzePattern(A);
- * // Compute the numerical factorization
+ * // Compute the numerical factorization.
  * solver.factorize(A);
- * //Use the factors to solve the linear system
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * We can directly call compute() instead of analyzePattern() and factorize():
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
+ * // Fill A and b.
+ * solver.compute(A);
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * Or give the matrix to the constructor SparseLU(const MatrixType& matrix):
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * // Fill A and b.
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver(A);
+ * // Use the factors to solve the linear system.
  * x = solver.solve(b);
  * \endcode
  *
@@ -150,10 +171,18 @@
   enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
 
  public:
+  /** \brief Basic constructor of the solver.
+   *
+   * Construct a SparseLU. As no matrix is given as argument, compute() should be called afterward with a matrix.
+   */
   SparseLU()
       : m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) {
     initperfvalues();
   }
+  /** \brief Constructor of the solver, based on a specific matrix.
+   *
+   * Construct a SparseLU and immediately call compute() with the given matrix.
+   */
   explicit SparseLU(const MatrixType& matrix)
       : m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) {
     initperfvalues();
@@ -168,9 +197,15 @@
   void factorize(const MatrixType& matrix);
   void simplicialfactorize(const MatrixType& matrix);
 
-  /**
+  /** \brief Analyze and factorize the matrix so the solver is ready to solve.
+   *
    * Compute the symbolic and numeric factorization of the input sparse matrix.
-   * The input matrix should be in column-major storage.
+   * The input matrix should be in column-major storage; otherwise analyzePattern()
+   * will make a heavy copy.
+   *
+   * This is equivalent to calling analyzePattern() followed by factorize().
+   *
+   * \sa analyzePattern(), factorize()
    */
   void compute(const MatrixType& matrix) {
     // Analyze
@@ -179,7 +214,9 @@
     factorize(matrix);
   }
 
-  /** \returns an expression of the transposed of the factored matrix.
+  /** \brief Return a solver for the transposed matrix.
+   *
+   * \returns an expression of the transposed of the factored matrix.
    *
    * A typical usage is to solve for the transposed problem A^T x = b:
    * \code
@@ -196,7 +233,9 @@
     return transposeView;
   }
 
-  /** \returns an expression of the adjoint of the factored matrix
+  /** \brief Return a solver for the adjoint matrix.
+   *
+   * \returns an expression of the adjoint of the factored matrix
    *
    * A typical usage is to solve for the adjoint problem A' x = b:
    * \code
@@ -215,19 +254,28 @@
     return adjointView;
   }
 
+  /** \brief Give the number of rows.
+   */
   inline Index rows() const { return m_mat.rows(); }
+  /** \brief Give the number of columns.
+   */
   inline Index cols() const { return m_mat.cols(); }
-  /** Indicate that the pattern of the input matrix is symmetric */
+  /** \brief Indicate that the pattern of the input matrix is symmetric.
+   */
   void isSymmetric(bool sym) { m_symmetricmode = sym; }
 
-  /** \returns an expression of the matrix L, internally stored as supernodes
+  /** \brief Give the matrix L.
+   *
+   * \returns an expression of the matrix L, internally stored as supernodes
    * The only operation available with this expression is the triangular solve
    * \code
    * y = b; matrixL().solveInPlace(y);
    * \endcode
    */
   SparseLUMatrixLReturnType<SCMatrix> matrixL() const { return SparseLUMatrixLReturnType<SCMatrix>(m_Lstore); }
-  /** \returns an expression of the matrix U,
+  /** \brief Give the matrix U.
+   *
+   * \returns an expression of the matrix U,
    * The only operation available with this expression is the triangular solve
    * \code
    * y = b; matrixU().solveInPlace(y);
@@ -237,12 +285,14 @@
     return SparseLUMatrixUReturnType<SCMatrix, Map<SparseMatrix<Scalar, ColMajor, StorageIndex>>>(m_Lstore, m_Ustore);
   }
 
-  /**
+  /** \brief Give the row matrix permutation.
+   *
    * \returns a reference to the row matrix permutation \f$ P_r \f$ such that \f$P_r A P_c^T = L U\f$
    * \sa colsPermutation()
    */
   inline const PermutationType& rowsPermutation() const { return m_perm_r; }
-  /**
+  /** \brief Give the column matrix permutation.
+   *
    * \returns a reference to the column matrix permutation\f$ P_c^T \f$ such that \f$P_r A P_c^T = L U\f$
    * \sa rowsPermutation()
    */
@@ -251,7 +301,9 @@
   void setPivotThreshold(const RealScalar& thresh) { m_diagpivotthresh = thresh; }
 
 #ifdef EIGEN_PARSED_BY_DOXYGEN
-  /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+  /** \brief Solve a system \f$ A X = B \f$
+   *
+   * \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
    *
   * \warning the destination matrix X in X = this->solve(B) must be column-major.
    *
@@ -267,14 +319,17 @@
    *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
    *          \c InvalidInput if the input matrix is invalid
    *
-   * \sa iparm()
+   * You can get a readable error message with lastErrorMessage().
+   *
+   * \sa lastErrorMessage()
    */
   ComputationInfo info() const {
     eigen_assert(m_isInitialized && "Decomposition is not initialized.");
     return m_info;
   }
 
-  /**
+  /** \brief Give a human-readable error message.
+   *
    * \returns A string describing the type of error
    */
   std::string lastErrorMessage() const { return m_lastError; }
@@ -302,7 +357,8 @@
     return true;
   }
 
-  /**
+  /** \brief Give the absolute value of the determinant.
+   *
    * \returns the absolute value of the determinant of the matrix of which
    * *this is the QR decomposition.
    *
@@ -330,7 +386,9 @@
     return det;
   }
 
-  /** \returns the natural log of the absolute value of the determinant of the matrix
+  /** \brief Give the natural log of the absolute determinant.
+   *
+   * \returns the natural log of the absolute value of the determinant of the matrix
    * of which **this is the QR decomposition
    *
    * \note This method is useful to work around the risk of overflow/underflow that's
@@ -356,7 +414,9 @@
     return det;
   }
 
-  /** \returns A number representing the sign of the determinant
+  /** \brief Give the sign of the determinant.
+   *
+   * \returns A number representing the sign of the determinant
    *
    * \sa absDeterminant(), logAbsDeterminant()
    */
@@ -380,7 +440,9 @@
     return det * m_detPermR * m_detPermC;
   }
 
-  /** \returns The determinant of the matrix.
+  /** \brief Give the determinant.
+   *
+   * \returns The determinant of the matrix.
    *
    * \sa absDeterminant(), logAbsDeterminant()
    */
@@ -401,7 +463,11 @@
     return (m_detPermR * m_detPermC) > 0 ? det : -det;
   }
 
+  /** \brief Give the number of nonzeros in matrix L.
+   */
   Index nnzL() const { return m_nnzL; }
+  /** \brief Give the number of nonzeros in matrix U.
+   */
   Index nnzU() const { return m_nnzU; }
 
  protected:
@@ -442,7 +508,8 @@
 };  // End class SparseLU
 
 // Functions needed by the analysis phase
-/**
+/** \brief Compute the column permutation.
+ *
  * Compute the column permutation to minimize the fill-in
  *
  *  - Apply this permutation to the input matrix -
@@ -451,6 +518,11 @@
  *
  *  - Postorder the elimination tree and the column permutation
  *
+ * It is possible to call compute() instead of analyzePattern() + factorize().
+ *
+ * If the matrix is row-major, this function will make a heavy copy.
+ *
+ * \sa factorize(), compute()
  */
 template <typename MatrixType, typename OrderingType>
 void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
@@ -516,23 +588,24 @@
 
 // Functions needed by the numerical factorization phase
 
-/**
+/** \brief Factorize the matrix to get the solver ready.
+ *
  *  - Numerical factorization
  *  - Interleaved with the symbolic factorization
- * On exit,  info is
  *
- *    = 0: successful factorization
+ * To check this function for errors, call info(); a more detailed description
+ * of the error is available from lastErrorMessage().
  *
- *    > 0: if info = i, and i is
+ * In the past (before 2012; the git history does not reach further back), this function returned an integer.
+ * That return value was 0 on a successful factorization.
+ * A value i with 0 < i <= A->ncol meant that U(i,i) is exactly zero: the factorization completed,
+ * but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
+ * A value > A->ncol was the number of bytes allocated when the memory allocation failure occurred,
+ * plus A->ncol. If lwork = -1, it was the estimated amount of space needed, plus A->ncol.
  *
- *       <= A->ncol: U(i,i) is exactly zero. The factorization has
- *          been completed, but the factor U is exactly singular,
- *          and division by zero will occur if it is used to solve a
- *          system of equations.
+ * In that historical interface, A was the name of the input matrix.
  *
- *       > A->ncol: number of bytes allocated when memory allocation
- *         failure occurred, plus A->ncol. If lwork = -1, it is
- *         the estimated amount of space needed, plus A->ncol.
+ * \sa analyzePattern(), compute(), SparseLU(), info(), lastErrorMessage()
  */
 template <typename MatrixType, typename OrderingType>
 void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix) {
@@ -572,6 +645,8 @@
   Index maxpanel = m_perfv.panel_size * m;
   // Allocate working storage common to the factor routines
   Index lwork = 0;
+  // memInit returns the size of the memory actually allocated when allocation
+  // fails, and 0 on success.
   Index info = Base::memInit(m, n, nnz, lwork, m_perfv.fillfactor, m_perfv.panel_size, m_glu);
   if (info) {
     m_lastError = "UNABLE TO ALLOCATE WORKING MEMORY\n\n";
@@ -656,6 +731,7 @@
       // Depth-first-search for the current column
       VectorBlock<IndexVector> panel_lsubk(panel_lsub, k, m);
       VectorBlock<IndexVector> repfnz_k(repfnz, k, m);
+      // Returns 0 on success, or the number of bytes allocated (> 0) when it runs out of space.
       info = Base::column_dfs(m, jj, m_perm_r.indices(), m_perfv.maxsuper, nseg, panel_lsubk, segrep, repfnz_k, xprune,
                               marker, parent, xplore, m_glu);
       if (info) {
@@ -667,6 +743,7 @@
       // Numeric updates to this column
       VectorBlock<ScalarVector> dense_k(dense, k, m);
       VectorBlock<IndexVector> segrep_k(segrep, nseg1, m - nseg1);
+      // Returns 0 on success, or the number of bytes allocated (> 0) when it runs out of space.
       info = Base::column_bmod(jj, (nseg - nseg1), dense_k, tempv, segrep_k, repfnz_k, jcol, m_glu);
       if (info) {
         m_lastError = "UNABLE TO EXPAND MEMORY IN COLUMN_BMOD() ";
@@ -676,6 +753,7 @@
       }
 
       // Copy the U-segments to ucol(*)
+      // Returns 0 on success, or the number of bytes allocated (> 0) when it runs out of space.
       info = Base::copy_to_ucol(jj, nseg, segrep, repfnz_k, m_perm_r.indices(), dense_k, m_glu);
       if (info) {
         m_lastError = "UNABLE TO EXPAND MEMORY IN COPY_TO_UCOL() ";
@@ -685,6 +763,7 @@
       }
 
       // Form the L-segment
+      // Returns 0 on success, or i > 0 if U(i, i) is exactly zero.
       info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu);
       if (info) {
         m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR";
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 2022cf0..2f6f89e 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -30,7 +30,7 @@
       hip_reset_flags()
       hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
       target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
-      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx1030)
+      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
     elseif(EIGEN_TEST_CUDA_CLANG)
       set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
       
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index 22cf346..da89857 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-
-  <!-- Fixed x Fixed Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
-      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : $T3</Size>
-          <ValuePointer>m_storage.m_data.array</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : $T3</Size>
-          <ValuePointer>m_storage.m_data.array</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- 2 x 2 Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>
-      <DisplayString>[2, 2] (fixed matrix)</DisplayString>
-      <Expand>
-        <Synthetic Name="[row 0]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
-        </Synthetic>        
-      </Expand>
-  </Type>
-  
-  <!-- 3 x 3 Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>
-      <DisplayString>[3, 3] (fixed matrix)</DisplayString>
-      <Expand>
-        <Synthetic Name="[row 0]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
-        </Synthetic>        
-      </Expand>
-  </Type>
-  
-  <!-- 4 x 4 Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>
-      <DisplayString>[4, 4] (fixed matrix)</DisplayString>
-      <Expand>
-        <Synthetic Name="[row 0]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 3]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 3]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
-        </Synthetic>        
-      </Expand>
-  </Type>  
-  
-  <!-- Dynamic x Dynamic Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Fixed x Dynamic Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Dynamic x Fixed Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Dynamic Column Vector -->
-  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
-      <Expand>
-        <Item Name="[size]">m_storage.m_cols</Item>
-        <ArrayItems>
-          <Size>m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Dynamic Row Vector -->
-  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
-      <Expand>
-        <Item Name="[size]">m_storage.m_rows</Item>
-        <ArrayItems>
-          <Size>m_storage.m_rows</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Fixed Vector -->
-  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>
-      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-      </Expand>
-  </Type>
-  
-  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>
-      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-        <Item Name="[y]">m_storage.m_data.array[1]</Item>
-      </Expand>
-  </Type>
-  
-  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>
-      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-        <Item Name="[y]">m_storage.m_data.array[1]</Item>
-        <Item Name="[z]">m_storage.m_data.array[2]</Item>
-      </Expand>
-  </Type>
-  
-    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>
-      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-        <Item Name="[y]">m_storage.m_data.array[1]</Item>
-        <Item Name="[z]">m_storage.m_data.array[2]</Item>
-        <Item Name="[w]">m_storage.m_data.array[3]</Item>
-      </Expand>
-  </Type>
-
-</AutoVisualizer>
+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+  <!-- Fixed x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- 2 x 2 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>
+      <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>
+  
+  <!-- 3 x 3 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>
+      <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>
+  
+  <!-- 4 x 4 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>
+      <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>  
+  
+  <!-- Dynamic x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Fixed x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic Column Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_cols</Item>
+        <ArrayItems>
+          <Size>m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic Row Vector -->
+  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_rows</Item>
+        <ArrayItems>
+          <Size>m_storage.m_rows</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Fixed Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>
+      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+      </Expand>
+  </Type>
+  
+  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>
+      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+      </Expand>
+  </Type>
+  
+  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>
+      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+      </Expand>
+  </Type>
+  
+    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>
+      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+        <Item Name="[w]">m_storage.m_data.array[3]</Item>
+      </Expand>
+  </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 273c10d..35ef580 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe 
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1],
-         z : ($c.m_storage.m_data.array)[2],
-         w : ($c.m_storage.m_data.array)[3]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        4,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
-        ")"
-      )
-   )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1],
-         z : ($c.m_storage.m_data.array)[2]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        3,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
-        ")"
-      )
-   )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        2,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
-        ")"
-      )
-   )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        1,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
-        ")"
-      )
-   )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.m_storage.m_rows,
-         cols: $c.m_storage.m_cols,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.m_storage.m_rows,
-         ",",
-           $c.m_storage.m_cols,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
-           ),
-         ")"
-      )
-   )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.RowsAtCompileTime,
-         cols: $c.m_storage.m_cols,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.RowsAtCompileTime,
-         ",",
-           $c.m_storage.m_cols,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
-           ),
-         ")"
-      )
-   )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.m_storage.m_rows,
-         cols: $c.ColsAtCompileTime,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.m_storage.m_rows,
-         ",",
-           $c.ColsAtCompileTime,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
-           ),
-         ")"
-      )
-   )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.RowsAtCompileTime,
-         cols: $c.ColsAtCompileTime,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data.array)[$i],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.RowsAtCompileTime,
-         ",",
-           $c.ColsAtCompileTime,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data.array)[$i],g],
-            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
-           ),
-         ")"
-      )
-   )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe 
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2],
+         w : ($c.m_storage.m_data.array)[3]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        4,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+        ")"
+      )
+   )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        3,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+        ")"
+      )
+   )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        2,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+        ")"
+      )
+   )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        1,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+        ")"
+      )
+   )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data.array)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox
index 645a7cd..7f89554 100644
--- a/doc/TutorialSlicingIndexing.dox
+++ b/doc/TutorialSlicingIndexing.dox
@@ -72,12 +72,12 @@
 </tr>
 <tr>
   <td>%Block starting at \c i,j having \c m rows, and \c n columns</td>
-  <td>\code A(seqN(i,m), seqN(i,n)) \endcode</td>
+  <td>\code A(seqN(i,m), seqN(j,n)) \endcode</td>
   <td>\code A.block(i,j,m,n) \endcode</td>
 </tr>
 <tr>
   <td>%Block starting at \c i0,j0 and ending at \c i1,j1</td>
-  <td>\code A(seq(i0,i1), seq(j0,j1) \endcode</td>
+  <td>\code A(seq(i0,i1), seq(j0,j1)) \endcode</td>
   <td>\code A.block(i0,j0,i1-i0+1,j1-j0+1) \endcode</td>
 </tr>
 <tr>
@@ -97,7 +97,7 @@
 </tr>
 <tr>
   <td>The middle row</td>
-  <td>\code A(last/2,all) \endcode</td>
+  <td>\code A(last/2, all) \endcode</td>
   <td>\code A.row((A.rows()-1)/2) \endcode</td>
 </tr>
 <tr>
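The corrected entry matters in practice: with the old seqN(i,n), the column range started at the row index. A short sketch using standard Eigen 3.4 indexing (not from the patch) showing that the two corrected forms select the same block:

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(8, 8);
      const Index i = 1, j = 2, m = 3, n = 4;
      MatrixXd b1 = A(seqN(i, m), seqN(j, n));  // slicing form
      MatrixXd b2 = A.block(i, j, m, n);        // block form
      return b1.isApprox(b2) ? 0 : 1;           // identical selections
    }
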
diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js
index afb97ed..f36b332 100644
--- a/doc/eigen_navtree_hacks.js
+++ b/doc/eigen_navtree_hacks.js
@@ -62,23 +62,161 @@
   }
 }
 
-// Overloaded to adjust the size of the navtree wrt the toc
-function resizeHeight() 
-{
-  var header  = $("#top");
-  var sidenav = $("#side-nav");
-  var content = $("#doc-content");
-  var navtree = $("#nav-tree");
-  var footer  = $("#nav-path");
-  var toc     = $("#nav-toc");
+/*
+ @licstart  The following is the entire license notice for the JavaScript code in this file.
 
-  var headerHeight = header.outerHeight();
-  var footerHeight = footer.outerHeight();
-  var tocHeight    = toc.height();
-  var windowHeight = $(window).height() - headerHeight - footerHeight;
-  content.css({height:windowHeight + "px"});
-  navtree.css({height:(windowHeight-tocHeight) + "px"});
-  sidenav.css({height:windowHeight + "px"});
+ The MIT License (MIT)
+
+ Copyright (C) 1997-2020 by Dimitri van Heesch
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ and associated documentation files (the "Software"), to deal in the Software without restriction,
+ including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or
+ substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ @licend  The above is the entire license notice for the JavaScript code in this file
+ */
+// We need to override the entire initResizable() just so we can change the height to account for the TOC.
+function initResizable()
+{
+  var cookie_namespace = 'doxygen';
+  var sidenav,navtree,content,header,footer,collapsed,collapsedWidth=0,barWidth=6,desktop_vp=768,titleHeight;
+
+  function readCookie(cookie)
+  {
+    var myCookie = cookie_namespace+"_"+cookie+"=";
+    if (document.cookie) {
+      var index = document.cookie.indexOf(myCookie);
+      if (index != -1) {
+        var valStart = index + myCookie.length;
+        var valEnd = document.cookie.indexOf(";", valStart);
+        if (valEnd == -1) {
+          valEnd = document.cookie.length;
+        }
+        var val = document.cookie.substring(valStart, valEnd);
+        return val;
+      }
+    }
+    return 0;
+  }
+
+  function writeCookie(cookie, val, expiration)
+  {
+    if (val==undefined) return;
+    if (expiration == null) {
+      var date = new Date();
+      date.setTime(date.getTime()+(10*365*24*60*60*1000)); // default expiration is 10 years
+      expiration = date.toGMTString();
+    }
+    document.cookie = cookie_namespace + "_" + cookie + "=" + val + "; expires=" + expiration+"; path=/";
+  }
+
+  function resizeWidth()
+  {
+    var windowWidth = $(window).width() + "px";
+    var sidenavWidth = $(sidenav).outerWidth();
+    content.css({marginLeft:parseInt(sidenavWidth)+"px"});
+    writeCookie('width',sidenavWidth-barWidth, null);
+  }
+
+  function restoreWidth(navWidth)
+  {
+    var windowWidth = $(window).width() + "px";
+    content.css({marginLeft:parseInt(navWidth)+barWidth+"px"});
+    sidenav.css({width:navWidth + "px"});
+  }
+
+  function resizeHeight()
+  {  
+    var headerHeight = header.outerHeight();
+    var footerHeight = footer.outerHeight();
+    var windowHeight = $(window).height() - headerHeight - footerHeight;
+    //==========================================================================
+    // MODIFICATION:
+    // This small section is the only portion modified within initResizable().
+    // The rest is copy-pasted from the doxygen-generated resize.js.
+    //
+    // Adjust nav height to make room for TOC.
+    var toc = $("#nav-toc");
+    var tocHeight = toc.height();
+    var navHeight = windowHeight;
+    // tocHeight is not always defined (e.g. if empty)
+    if (tocHeight) {
+      navHeight = windowHeight - tocHeight;
+    }
+    //==========================================================================
+
+    content.css({height:windowHeight + "px"});
+    navtree.css({height:navHeight + "px"});
+    sidenav.css({height:windowHeight + "px"});
+    
+    var width=$(window).width();
+    if (width!=collapsedWidth) {
+      if (width<desktop_vp && collapsedWidth>=desktop_vp) {
+        if (!collapsed) {
+          collapseExpand();
+        }
+      } else if (width>desktop_vp && collapsedWidth<desktop_vp) {
+        if (collapsed) {
+          collapseExpand();
+        }
+      }
+      collapsedWidth=width;
+    }
+    if (location.hash.slice(1)) {
+      (document.getElementById(location.hash.slice(1))||document.body).scrollIntoView();
+    }
+  }
+
+  function collapseExpand()
+  {
+    if (sidenav.width()>0) {
+      restoreWidth(0);
+      collapsed=true;
+    }
+    else {
+      var width = readCookie('width');
+      if (width>200 && width<$(window).width()) { restoreWidth(width); } else { restoreWidth(200); }
+      collapsed=false;
+    }
+  }
+  header  = $("#top");
+  sidenav = $("#side-nav");
+  content = $("#doc-content");
+  navtree = $("#nav-tree");
+  footer  = $("#nav-path");
+
+  $(".side-nav-resizable").resizable({resize: function(e, ui) { resizeWidth(); } });
+  $(sidenav).resizable({ minWidth: 0 });
+  $(window).resize(function() { resizeHeight(); });
+  var device = navigator.userAgent.toLowerCase();
+  var touch_device = device.match(/(iphone|ipod|ipad|android)/);
+  if (touch_device) { /* wider split bar for touch only devices */
+    $(sidenav).css({ paddingRight:'20px' });
+    $('.ui-resizable-e').css({ width:'20px' });
+    $('#nav-sync').css({ right:'34px' });
+    barWidth=20;
+  }
+  var width = readCookie('width');
+  if (width) { restoreWidth(width); } else { resizeWidth(); }
+  resizeHeight();
+  var url = location.href;
+  var i=url.indexOf("#");
+  if (i>=0) window.location.hash=url.substr(i);
+  var _preventDefault = function(evt) { evt.preventDefault(); };
+  $("#splitbar").bind("dragstart", _preventDefault).bind("selectstart", _preventDefault);
+  $(".ui-resizable-handle").dblclick(collapseExpand);
+  $(window).on('load',resizeHeight);
 }
 
 // Overloaded to save the root node into global_navtree_object
@@ -241,7 +379,4 @@
       setTimeout(arguments.callee, 10);
     }
   })();
-
-  $(window).on("load", resizeHeight);
 });
-
diff --git a/failtest/const_qualified_block_method_retval_0.cpp b/failtest/const_qualified_block_method_retval_0.cpp
index 08b5d3c..6d19bb4 100644
--- a/failtest/const_qualified_block_method_retval_0.cpp
+++ b/failtest/const_qualified_block_method_retval_0.cpp
@@ -8,6 +8,9 @@
 
 using namespace Eigen;
 
-void foo(CV_QUALIFIER Matrix3d &m) { Block<Matrix3d, 3, 3> b(m.block<3, 3>(0, 0)); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+  Block<Matrix3d, 3, 3> b(m.block<3, 3>(0, 0));
+  EIGEN_UNUSED_VARIABLE(b);
+}
 
 int main() {}
diff --git a/failtest/const_qualified_block_method_retval_1.cpp b/failtest/const_qualified_block_method_retval_1.cpp
index 06e12c7..d58a018 100644
--- a/failtest/const_qualified_block_method_retval_1.cpp
+++ b/failtest/const_qualified_block_method_retval_1.cpp
@@ -8,6 +8,9 @@
 
 using namespace Eigen;
 
-void foo(CV_QUALIFIER Matrix3d &m) { Block<Matrix3d> b(m.block(0, 0, 3, 3)); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+  Block<Matrix3d> b(m.block(0, 0, 3, 3));
+  EIGEN_UNUSED_VARIABLE(b);
+}
 
 int main() {}
diff --git a/failtest/const_qualified_diagonal_method_retval.cpp b/failtest/const_qualified_diagonal_method_retval.cpp
index f3acba6..a10796a 100644
--- a/failtest/const_qualified_diagonal_method_retval.cpp
+++ b/failtest/const_qualified_diagonal_method_retval.cpp
@@ -8,6 +8,9 @@
 
 using namespace Eigen;
 
-void foo(CV_QUALIFIER Matrix3d &m) { Diagonal<Matrix3d> b(m.diagonal()); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+  Diagonal<Matrix3d> b(m.diagonal());
+  EIGEN_UNUSED_VARIABLE(b);
+}
 
 int main() {}
diff --git a/failtest/const_qualified_transpose_method_retval.cpp b/failtest/const_qualified_transpose_method_retval.cpp
index 394f64a..33200a9 100644
--- a/failtest/const_qualified_transpose_method_retval.cpp
+++ b/failtest/const_qualified_transpose_method_retval.cpp
@@ -8,6 +8,9 @@
 
 using namespace Eigen;
 
-void foo(CV_QUALIFIER Matrix3d &m) { Transpose<Matrix3d> b(m.transpose()); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+  Transpose<Matrix3d> b(m.transpose());
+  EIGEN_UNUSED_VARIABLE(b);
+}
 
 int main() {}
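All four failtests now silence unused-variable warnings the same way. EIGEN_UNUSED_VARIABLE is Eigen's own macro; the sketch below shows the usual trick behind such macros with a hypothetical MY_UNUSED_VARIABLE, not Eigen's exact definition:

    #define MY_UNUSED_VARIABLE(var) ((void)(var))

    int main() {
      int probe = 42;
      MY_UNUSED_VARIABLE(probe);  // one read of the variable keeps -Wunused-variable quiet
      return 0;
    }
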
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index 8d6d754..1babd13 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -22,7 +22,7 @@
 include_directories(../blas)
 
 set(EigenLapack_SRCS
-single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp
+  dsecnd_INT_CPU_TIME.cpp second_INT_CPU_TIME.cpp single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp
 )
 
 if(EIGEN_Fortran_COMPILER_WORKS)
@@ -38,7 +38,6 @@
   dlapy2.f  dlapy3.f  slapy2.f  slapy3.f
   clacgv.f  zlacgv.f
   slamch.f  dlamch.f
-  second_NONE.f dsecnd_NONE.f
 )
 
 option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests")
diff --git a/lapack/dsecnd_INT_CPU_TIME.cpp b/lapack/dsecnd_INT_CPU_TIME.cpp
new file mode 100644
index 0000000..684fa1d
--- /dev/null
+++ b/lapack/dsecnd_INT_CPU_TIME.cpp
@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef _WIN32
+#include <cstdint>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <ctime>
+#endif
+
+extern "C" {
+double dsecnd_();
+}
+
+// Elapsed CPU Time in seconds.
+double dsecnd_() {
+#ifdef _WIN32
+  // For MSVC, use `GetProcessTimes` for proper CPU time - MSVC uses
+  // a non-standard `std::clock` implementation (see
+  // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/clock?view=msvc-170).
+  // GetProcessTimes() uses 100-nanosecond time units.
+  FILETIME creation_time, exit_time, kernel_time, user_time;
+  GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
+  ULARGE_INTEGER user;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  uint64_t time_100ns = user.QuadPart;
+  return static_cast<double>(time_100ns) / 10000000.0;
+#else
+  return static_cast<double>(std::clock()) / static_cast<double>(CLOCKS_PER_SEC);
+#endif
+}
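For callers, dsecnd_ behaves like the reference LAPACK timer: two samples bracket the work, and the difference is CPU seconds. A hedged usage sketch (the loop is an arbitrary stand-in for real work; printing acc keeps it from being optimized away):

    #include <iostream>

    extern "C" double dsecnd_();  // implemented in dsecnd_INT_CPU_TIME.cpp above

    int main() {
      double t0 = dsecnd_();
      double acc = 0.0;
      for (long i = 0; i < 100000000L; ++i) acc = acc + 1e-9 * static_cast<double>(i);
      double elapsed = dsecnd_() - t0;
      std::cout << "CPU seconds: " << elapsed << " (acc=" << acc << ")\n";
      return 0;
    }
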
diff --git a/lapack/second_INT_CPU_TIME.cpp b/lapack/second_INT_CPU_TIME.cpp
new file mode 100644
index 0000000..d6eb402
--- /dev/null
+++ b/lapack/second_INT_CPU_TIME.cpp
@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef _WIN32
+#include <cstdint>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <ctime>
+#endif
+
+extern "C" {
+float second_();
+}
+
+// Elapsed CPU Time in seconds.
+float second_() {
+#ifdef _WIN32
+  // For MSVC, use `GetProcessTimes` for proper CPU time - MSVC uses
+  // a non-standard `std::clock` implementation (see
+  // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/clock?view=msvc-170).
+  // GetProcessTimes() uses 100-nanosecond time units.
+  FILETIME creation_time, exit_time, kernel_time, user_time;
+  GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
+  ULARGE_INTEGER user;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  uint64_t time_100ns = user.QuadPart;
+  return static_cast<float>(time_100ns) / 10000000.0f;
+#else
+  return static_cast<float>(std::clock()) / static_cast<float>(CLOCKS_PER_SEC);
+#endif
+}
diff --git a/scripts/relicense.py b/scripts/relicense.py
deleted file mode 100644
index 8a5265f..0000000
--- a/scripts/relicense.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# This file is part of Eigen, a lightweight C++ template library
-# for linear algebra.
-#
-# Copyright (C) 2012 Keir Mierle <mierle@gmail.com>
-#
-# This Source Code Form is subject to the terms of the Mozilla
-# Public License v. 2.0. If a copy of the MPL was not distributed
-# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-# Author: mierle@gmail.com (Keir Mierle)
-#
-# Make the long-awaited conversion to MPL.
-
-lgpl3_header = '''
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-'''
-
-mpl2_header = """
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-"""
-
-import os
-import sys
-
-exclusions = set(['relicense.py'])
-
-def update(text):
-  if text.find(lgpl3_header) == -1:
-    return text, False
-  return text.replace(lgpl3_header, mpl2_header), True
-
-rootdir = sys.argv[1]
-for root, sub_folders, files in os.walk(rootdir):
-    for basename in files:
-        if basename in exclusions:
-          print 'SKIPPED', filename
-          continue
-        filename = os.path.join(root, basename)
-        fo = file(filename)
-        text = fo.read()
-        fo.close()
-
-        text, updated = update(text)
-        if updated:
-          fo = file(filename, "w")
-          fo.write(text)
-          fo.close()
-          print 'UPDATED', filename
-        else:
-          print '       ', filename
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fbbc98a..4c7c3a4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -48,7 +48,7 @@
   set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
   set(CHOLMOD_ALL_LIBS  ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ")
-  
+
   ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ")
@@ -61,7 +61,7 @@
   set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ")
-  
+
   ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ")
@@ -74,7 +74,7 @@
   set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ")
-  
+
   ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ")
@@ -87,7 +87,7 @@
   set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS  "SuperLU, ")
-  
+
   ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS  "SuperLU, ")
@@ -171,6 +171,7 @@
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official")
 add_custom_target(BuildOfficial)
 
+ei_add_test(clz)
 ei_add_test(rand)
 ei_add_test(meta)
 ei_add_test(maxsizevector)
@@ -406,7 +407,7 @@
   string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")  
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 
   if(EIGEN_TEST_CUDA_CLANG)
     string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
@@ -433,12 +434,12 @@
     set(CUDA_NVCC_FLAGS  "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}")
     cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   endif()
-  
+
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
-  
+
   ei_add_test(gpu_example)
   ei_add_test(gpu_basic)
-  
+
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
 endif()
@@ -477,7 +478,7 @@
       message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
     else ()
       message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
-    endif() 
+    endif()
   endif()
 endif()
 
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 0cfea8b..543ef2e 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -97,10 +97,12 @@
       Scalar e = static_cast<Scalar>(ref(lhs(i, j), rhs(i, j)));
       Scalar a = actual(i, j);
 #if EIGEN_ARCH_ARM
-      // Work around NEON flush-to-zero mode
-      // if ref returns denormalized value and Eigen returns 0, then skip the test
-      int ref_fpclass = std::fpclassify(e);
-      if (a == Scalar(0) && ref_fpclass == FP_SUBNORMAL) continue;
+      // Work around NEON flush-to-zero mode.
+      // If ref returns a subnormal value and Eigen returns 0, then skip the test.
+      if (a == Scalar(0) && (e > -(std::numeric_limits<Scalar>::min)() && e < (std::numeric_limits<Scalar>::min)()) &&
+          (e <= -std::numeric_limits<Scalar>::denorm_min() || e >= std::numeric_limits<Scalar>::denorm_min())) {
+        continue;
+      }
 #endif
       bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
                      ((numext::isnan)(a) && (numext::isnan)(e));
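The replacement predicate tests the subnormal range directly instead of relying on std::fpclassify, which can misreport subnormals under flush-to-zero or fast-math settings. Written as a hypothetical standalone helper, the check reads:

    #include <limits>

    template <typename T>
    bool is_subnormal(T x) {
      const T normal_min = (std::numeric_limits<T>::min)();       // smallest normal
      const T denorm_min = std::numeric_limits<T>::denorm_min();  // smallest subnormal
      // Strictly inside (-normal_min, normal_min), but with magnitude >= denorm_min:
      return (x > -normal_min && x < normal_min) && (x <= -denorm_min || x >= denorm_min);
    }

    int main() {
      bool a = is_subnormal(std::numeric_limits<float>::denorm_min());  // true
      bool b = is_subnormal(0.0f);                                      // false: zero is not subnormal
      bool c = is_subnormal(1.0f);                                      // false: normal
      return (a && !b && !c) ? 0 : 1;
    }
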
diff --git a/test/clz.cpp b/test/clz.cpp
new file mode 100644
index 0000000..b56d328
--- /dev/null
+++ b/test/clz.cpp
@@ -0,0 +1,74 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template <typename T>
+int ref_clz(T val) {
+  constexpr int kNumBits = sizeof(T) * CHAR_BIT;
+  T kMsbMask = T(1) << (kNumBits - 1);
+  int z = 0;
+  for (; z < kNumBits && ((val & kMsbMask) == 0); ++z) {
+    val <<= 1;
+  }
+  return z;
+}
+
+template <typename T>
+int ref_ctz(T val) {
+  constexpr int kNumBits = sizeof(T) * CHAR_BIT;
+  T kLsbMask = T(1);
+  int z = 0;
+  for (; z < kNumBits && ((val & kLsbMask) == 0); ++z) {
+    val >>= 1;
+  }
+  return z;
+}
+
+template <typename T>
+void test_clz_ctz() {
+  T step = sizeof(T) <= 2 ? 1 : (Eigen::NumTraits<T>::highest() / (T(1) << 16));
+  T iters = Eigen::NumTraits<T>::highest() / step;
+  for (T i = 0; i < iters; ++i) {
+    T val = i * step;
+    int expected_clz = ref_clz(val);
+    int actual_clz = Eigen::internal::clz(val);
+    VERIFY(expected_clz == actual_clz);
+
+    int expected_ctz = ref_ctz(val);
+    int actual_ctz = Eigen::internal::ctz(val);
+    VERIFY(expected_ctz == actual_ctz);
+  }
+}
+
+template <typename T>
+void test_clz_ctz_random() {
+  for (int i = 0; i < 1024 * 1024; ++i) {
+    T val = Eigen::internal::random<T>();
+    int expected_clz = ref_clz(val);
+    int actual_clz = Eigen::internal::clz(val);
+    VERIFY(expected_clz == actual_clz);
+
+    int expected_ctz = ref_ctz(val);
+    int actual_ctz = Eigen::internal::ctz(val);
+    VERIFY(expected_ctz == actual_ctz);
+  }
+}
+
+EIGEN_DECLARE_TEST(clz) {
+  CALL_SUBTEST_1(test_clz_ctz<uint8_t>());
+  CALL_SUBTEST_2(test_clz_ctz<uint16_t>());
+  CALL_SUBTEST_3(test_clz_ctz<uint32_t>());
+  CALL_SUBTEST_4(test_clz_ctz<uint64_t>());
+
+  for (int i = 0; i < g_repeat; i++) {
+    test_clz_ctz_random<uint32_t>();
+    test_clz_ctz_random<uint64_t>();
+  }
+}
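The helpers under test come from the "Add internal ctz/clz implementation" change. A hedged usage sketch; note that Eigen::internal:: APIs are not part of the public interface and may change:

    #include <Eigen/Core>
    #include <cstdint>
    #include <iostream>

    int main() {
      uint32_t v = 0x00010000u;  // only bit 16 set
      std::cout << Eigen::internal::clz(v) << "\n";  // 15: bits 31..17 are zero
      std::cout << Eigen::internal::ctz(v) << "\n";  // 16: bits 15..0 are zero
      return 0;
    }
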
diff --git a/test/product_threaded.cpp b/test/product_threaded.cpp
index 410f767..1eb38fb 100644
--- a/test/product_threaded.cpp
+++ b/test/product_threaded.cpp
@@ -13,9 +13,9 @@
 void test_parallelize_gemm() {
   constexpr int n = 1024;
   constexpr int num_threads = 4;
-  MatrixXf a(n, n);
-  MatrixXf b(n, n);
-  MatrixXf c(n, n);
+  MatrixXf a = MatrixXf::Random(n, n);
+  MatrixXf b = MatrixXf::Random(n, n);
+  MatrixXf c = MatrixXf::Random(n, n);
   c.noalias() = a * b;
 
   ThreadPool pool(num_threads);
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 130284a..e9ed3d5 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -209,6 +209,11 @@
   }
 }
 
+void test_empty() {
+  Eigen::VectorXf empty(0);
+  VERIFY_IS_EQUAL(empty.stableNorm(), 0.0f);
+}
+
 template <typename Scalar>
 void test_hypot() {
   typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -235,6 +240,8 @@
 }
 
 EIGEN_DECLARE_TEST(stable_norm) {
+  CALL_SUBTEST_1(test_empty());
+
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_3(test_hypot<double>());
     CALL_SUBTEST_4(test_hypot<float>());
diff --git a/test/threads_non_blocking_thread_pool.cpp b/test/threads_non_blocking_thread_pool.cpp
index 2f0cf58..e805cf2 100644
--- a/test/threads_non_blocking_thread_pool.cpp
+++ b/test/threads_non_blocking_thread_pool.cpp
@@ -112,53 +112,56 @@
 
 static void test_pool_partitions() {
   const int kThreads = 2;
-  ThreadPool tp(kThreads);
-
-  // Assign each thread to its own partition, so that stealing other work only
-  // occurs globally when a thread is idle.
-  std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
-  for (int i = 0; i < kThreads; ++i) {
-    steal_partitions[i] = std::make_pair(i, i + 1);
-  }
-  tp.SetStealPartitions(steal_partitions);
 
   std::atomic<int> running(0);
   std::atomic<int> done(0);
   std::atomic<int> phase(0);
 
-  // Schedule kThreads tasks and ensure that they all are running.
-  for (int i = 0; i < kThreads; ++i) {
-    tp.Schedule([&]() {
-      const int thread_id = tp.CurrentThreadId();
-      VERIFY_GE(thread_id, 0);
-      VERIFY_LE(thread_id, kThreads - 1);
-      ++running;
-      while (phase < 1) {
-      }
-      ++done;
-    });
+  {
+    ThreadPool tp(kThreads);
+
+    // Assign each thread to its own partition, so that stealing other work only
+    // occurs globally when a thread is idle.
+    std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
+    for (int i = 0; i < kThreads; ++i) {
+      steal_partitions[i] = std::make_pair(i, i + 1);
+    }
+    tp.SetStealPartitions(steal_partitions);
+
+    // Schedule kThreads tasks and ensure that they all are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&]() {
+        const int thread_id = tp.CurrentThreadId();
+        VERIFY_GE(thread_id, 0);
+        VERIFY_LE(thread_id, kThreads - 1);
+        ++running;
+        while (phase < 1) {
+        }
+        ++done;
+      });
+    }
+    while (running != kThreads) {
+    }
+    // Schedule each closure to only run on thread 'i' and verify that it does.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.ScheduleWithHint(
+          [&, i]() {
+            ++running;
+            const int thread_id = tp.CurrentThreadId();
+            VERIFY_IS_EQUAL(thread_id, i);
+            while (phase < 2) {
+            }
+            ++done;
+          },
+          i, i + 1);
+    }
+    running = 0;
+    phase = 1;
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 2;
   }
-  while (running != kThreads) {
-  }
-  // Schedule each closure to only run on thread 'i' and verify that it does.
-  for (int i = 0; i < kThreads; ++i) {
-    tp.ScheduleWithHint(
-        [&, i]() {
-          ++running;
-          const int thread_id = tp.CurrentThreadId();
-          VERIFY_IS_EQUAL(thread_id, i);
-          while (phase < 2) {
-          }
-          ++done;
-        },
-        i, i + 1);
-  }
-  running = 0;
-  phase = 1;
-  while (running != kThreads) {
-  }
-  running = 0;
-  phase = 2;
 }
 
 EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool) {
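The restructuring above is about shutdown order: the running/done/phase atomics are declared before the pool and now outlive it, and the pool's destructor joins its workers at the closing brace, before the atomics are destroyed. A minimal sketch of the same destruction-order idea with a plain std::thread (names are illustrative, not from the test):

    #include <atomic>
    #include <thread>

    int main() {
      std::atomic<int> counter(0);   // declared first, destroyed last
      {
        std::thread worker([&] { ++counter; });
        worker.join();               // a ThreadPool's destructor joins similarly
      }                              // worker is gone; counter is still alive
      return counter.load() == 1 ? 0 : 1;
    }
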
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index c4dedc1..9375398 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -934,6 +934,7 @@
     template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorChippingOp<DimId, const Derived>
     chip(const Index offset) const {
+      EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
       return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
     }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1132,11 +1133,13 @@
     template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorChippingOp<DimId, const Derived>
     chip(const Index offset) const {
+      EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
       return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
     }
     template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorChippingOp<DimId, Derived>
     chip(const Index offset) {
+      EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
       return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
     }
 
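With the static asserts in place, a chip dimension outside [0, NumDimensions) is rejected at compile time. A small, hypothetical illustration:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(2, 3, 4);
      t.setZero();
      Eigen::Tensor<float, 2> s = t.chip<1>(0);  // OK: dim 1 < NumDimensions (3)
      // Eigen::Tensor<float, 2> bad = t.chip<5>(0);  // now fails: Chip_Dim_out_of_range
      return s.dimension(0) == 2 ? 0 : 1;
    }
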
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 8a784af..32980c7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -78,7 +78,9 @@
   typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
-      : m_xpr(expr), m_offset(offset), m_dim(dim) {}
+      : m_xpr(expr), m_offset(offset), m_dim(dim) {
+    eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range");
+  }
 
   EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; }
   EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c3a7ef4..d0fbfb3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -13,6 +13,8 @@
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
 
+#include <memory>
+
 namespace Eigen {
 
 /** \class TensorForcedEval
@@ -91,6 +93,26 @@
 };
 }  // end namespace internal
 
+template <typename Device>
+class DeviceTempPointerHolder {
+ public:
+  DeviceTempPointerHolder(const Device& device, size_t size)
+      : device_(device), size_(size), ptr_(device.allocate_temp(size)) {}
+
+  ~DeviceTempPointerHolder() {
+    device_.deallocate_temp(ptr_);
+    size_ = 0;
+    ptr_ = nullptr;
+  }
+
+  void* ptr() { return ptr_; }
+
+ private:
+  Device device_;
+  size_t size_;
+  void* ptr_;
+};
+
 template <typename ArgType_, typename Device>
 struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> {
   typedef const internal::remove_all_t<ArgType_> ArgType;
@@ -124,13 +146,20 @@
   //===--------------------------------------------------------------------===//
 
   TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) {}
+      : m_impl(op.expression(), device),
+        m_op(op.expression()),
+        m_device(device),
+        m_buffer_holder(nullptr),
+        m_buffer(nullptr) {}
+
+  ~TensorEvaluator() { cleanup(); }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     const Index numValues = internal::array_prod(m_impl.dimensions());
-    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+    m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType));
+    m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr());
 
     internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
 
@@ -148,7 +177,9 @@
   template <typename EvalSubExprsCallback>
   EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
     const Index numValues = internal::array_prod(m_impl.dimensions());
-    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+    m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType));
+    m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr());
+
     typedef TensorEvalToOp<const std::remove_const_t<ArgType>> EvalTo;
     EvalTo evalToTmp(m_device.get(m_buffer), m_op);
 
@@ -162,8 +193,8 @@
 #endif
 
   EIGEN_STRONG_INLINE void cleanup() {
-    m_device.deallocate_temp(m_buffer);
-    m_buffer = NULL;
+    m_buffer_holder = nullptr;
+    m_buffer = nullptr;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; }
@@ -179,7 +210,7 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
                                                           bool /*root_of_expr_ast*/ = false) const {
-    eigen_assert(m_buffer != NULL);
+    eigen_assert(m_buffer != nullptr);
     return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
   }
 
@@ -187,13 +218,14 @@
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; }
 
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
   const Device EIGEN_DEVICE_REF m_device;
-  EvaluatorPointerType m_buffer;
+  std::shared_ptr<DeviceTempPointerHolder<Device>> m_buffer_holder;
+  EvaluatorPointerType m_buffer;  // Cached copy of the value stored in m_buffer_holder.
 };
 
 }  // end namespace Eigen
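The shared_ptr-held holder is what makes copies of the evaluator safe: every copy shares one allocation, and the destructor of the last holder frees it exactly once. A stripped-down sketch of the same ownership pattern (names are illustrative, with malloc/free standing in for allocate_temp/deallocate_temp):

    #include <cstdlib>
    #include <memory>

    struct TempBufferHolder {
      explicit TempBufferHolder(std::size_t size) : ptr_(std::malloc(size)) {}
      TempBufferHolder(const TempBufferHolder&) = delete;  // ownership is never duplicated
      TempBufferHolder& operator=(const TempBufferHolder&) = delete;
      ~TempBufferHolder() { std::free(ptr_); }
      void* ptr() { return ptr_; }
     private:
      void* ptr_;
    };

    struct Evaluator {
      std::shared_ptr<TempBufferHolder> holder;  // copies of Evaluator share one buffer
    };

    int main() {
      Evaluator a;
      a.holder = std::make_shared<TempBufferHolder>(1024);
      Evaluator b = a;  // previously a shallow copy could double-free; now refcounted
      return a.holder->ptr() == b.holder->ptr() ? 0 : 1;
    }
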
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 5e65b26..f92622d 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -309,6 +309,7 @@
   out << header << std::endl;
   out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n";
   int count = 0;
+  EIGEN_UNUSED_VARIABLE(count);
   for (int j = 0; j < mat.outerSize(); ++j)
     for (typename SparseMatrixType::InnerIterator it(mat, j); it; ++it) {
       ++count;
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index ce78892..68455b3 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -36,6 +36,8 @@
 static void test_static_dimension_failure() {
   Tensor<int, 2, DataLayout> left(2, 3);
   Tensor<int, 3, DataLayout> right(2, 3, 1);
+  left.setRandom();
+  right.setRandom();
 
 #ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
   // Technically compatible, but we static assert that the inputs have same
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index ba553f9..228fa9e 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -24,7 +24,7 @@
 // Default assignment that does not use block evaluation or vectorization.
 // We assume that default coefficient evaluation is well tested and correct.
 template <typename Dst, typename Expr>
-static void DefaultAssign(Dst& dst, Expr expr) {
+void DefaultAssign(Dst& dst, Expr expr) {
   using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
   using Executor = Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
                                                    /*Vectorizable=*/false,
@@ -35,7 +35,7 @@
 
 // Assignment with specified device and tiling strategy.
 template <bool Vectorizable, TiledEvaluation Tiling, typename Device, typename Dst, typename Expr>
-static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+void DeviceAssign(Device& d, Dst& dst, Expr expr) {
   using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
   using Executor = Eigen::internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
@@ -52,7 +52,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_unary_expr(Device d) {
+void test_execute_unary_expr(Device d) {
   static constexpr int Options = 0 | Layout;
 
   // Pick a large enough tensor size to bypass small tensor block evaluation
@@ -77,7 +77,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_binary_expr(Device d) {
+void test_execute_binary_expr(Device d) {
   static constexpr int Options = 0 | Layout;
 
   // Pick a large enough tensor size to bypass small tensor block evaluation
@@ -105,7 +105,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_broadcasting(Device d) {
+void test_execute_broadcasting(Device d) {
   static constexpr int Options = 0 | Layout;
 
   auto dims = RandomDims<NumDims>(1, 10);
@@ -134,92 +134,103 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_chipping_rvalue(Device d) {
-  auto dims = RandomDims<NumDims>(1, 10);
-  Tensor<T, NumDims, Layout, Index> src(dims);
-  src.setRandom();
+struct test_execute_chipping_rvalue_runner {
+  template <int ChipDim>
+  static std::enable_if_t<0 <= ChipDim, void> run_dim(Device& d, const array<Index, NumDims>& dims,
+                                                      const Tensor<T, NumDims, Layout, Index>& src) {
+    const auto offset = internal::random<Index>(0, dims[(ChipDim)] - 1);
+    const auto expr = src.template chip<ChipDim>(offset);
 
-#define TEST_CHIPPING(CHIP_DIM)                                                            \
-  if (NumDims > (CHIP_DIM)) {                                                              \
-    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);                  \
-    const auto expr = src.template chip<(CHIP_DIM)>(offset);                               \
-                                                                                           \
-    Tensor<T, NumDims - 1, Layout, Index> golden;                                          \
-    golden = expr;                                                                         \
-                                                                                           \
-    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());                        \
-                                                                                           \
-    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;                    \
-    using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; \
-                                                                                           \
-    Executor::run(Assign(dst, expr), d);                                                   \
-                                                                                           \
-    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {                             \
-      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                                      \
-    }                                                                                      \
+    Tensor<T, NumDims - 1, Layout, Index> golden;
+    golden = expr;
+
+    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());
+
+    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+    using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+    Executor::run(Assign(dst, expr), d);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+    // Recursively reduce chip dimension.
+    run_dim<ChipDim - 1>(d, dims, src);
   }
 
-  TEST_CHIPPING(0)
-  TEST_CHIPPING(1)
-  TEST_CHIPPING(2)
-  TEST_CHIPPING(3)
-  TEST_CHIPPING(4)
-  TEST_CHIPPING(5)
+  template <int ChipDim>
+  static std::enable_if_t<(ChipDim < 0), void> run_dim(Device&, const array<Index, NumDims>&,
+                                                       const Tensor<T, NumDims, Layout, Index>&) {}
 
-#undef TEST_CHIPPING
+  static void run(Device d) {
+    auto dims = RandomDims<NumDims>(1, 10);
+    Tensor<T, NumDims, Layout, Index> src(dims);
+    src.setRandom();
+    run_dim<NumDims - 1>(d, dims, src);
+  }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
+void test_execute_chipping_rvalue(Device d) {
+  test_execute_chipping_rvalue_runner<T, NumDims, Device, Vectorizable, Tiling, Layout>::run(d);
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_chipping_lvalue(Device d) {
-  auto dims = RandomDims<NumDims>(1, 10);
+struct test_execute_chipping_lvalue_runner {
+  template <int ChipDim>
+  static std::enable_if_t<0 <= ChipDim> run_dim(Device& d, const array<Index, NumDims>& dims) {
+    /* Generate random data that we'll assign to the chipped tensor dim. */
+    array<Index, NumDims - 1> src_dims;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      int dim = i < (ChipDim) ? i : i + 1;
+      src_dims[i] = dims[dim];
+    }
 
-#define TEST_CHIPPING(CHIP_DIM)                                                            \
-  if (NumDims > (CHIP_DIM)) {                                                              \
-    /* Generate random data that we'll assign to the chipped tensor dim. */                \
-    array<Index, NumDims - 1> src_dims;                                                    \
-    for (int i = 0; i < NumDims - 1; ++i) {                                                \
-      int dim = i < (CHIP_DIM) ? i : i + 1;                                                \
-      src_dims[i] = dims[dim];                                                             \
-    }                                                                                      \
-                                                                                           \
-    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                                   \
-    src.setRandom();                                                                       \
-                                                                                           \
-    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);                  \
-                                                                                           \
-    Tensor<T, NumDims, Layout, Index> random(dims);                                        \
-    random.setZero();                                                                      \
-                                                                                           \
-    Tensor<T, NumDims, Layout, Index> golden(dims);                                        \
-    golden = random;                                                                       \
-    golden.template chip<(CHIP_DIM)>(offset) = src;                                        \
-                                                                                           \
-    Tensor<T, NumDims, Layout, Index> dst(dims);                                           \
-    dst = random;                                                                          \
-    auto expr = dst.template chip<(CHIP_DIM)>(offset);                                     \
-                                                                                           \
-    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;                    \
-    using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; \
-                                                                                           \
-    Executor::run(Assign(expr, src), d);                                                   \
-                                                                                           \
-    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {                             \
-      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                                      \
-    }                                                                                      \
+    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);
+    src.setRandom();
+
+    const auto offset = internal::random<Index>(0, dims[(ChipDim)] - 1);
+
+    Tensor<T, NumDims, Layout, Index> random(dims);
+    random.setZero();
+
+    Tensor<T, NumDims, Layout, Index> golden(dims);
+    golden = random;
+    golden.template chip<(ChipDim)>(offset) = src;
+
+    Tensor<T, NumDims, Layout, Index> dst(dims);
+    dst = random;
+    auto expr = dst.template chip<(ChipDim)>(offset);
+
+    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
+    using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+    Executor::run(Assign(expr, src), d);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+    run_dim<ChipDim - 1>(d, dims);
   }
 
-  TEST_CHIPPING(0)
-  TEST_CHIPPING(1)
-  TEST_CHIPPING(2)
-  TEST_CHIPPING(3)
-  TEST_CHIPPING(4)
-  TEST_CHIPPING(5)
+  template <int ChipDim>
+  static std::enable_if_t<(ChipDim < 0), void> run_dim(Device&, const array<Index, NumDims>&) {}
 
-#undef TEST_CHIPPING
+  static void run(Device d) {
+    auto dims = RandomDims<NumDims>(1, 10);
+    run_dim<NumDims - 1>(d, dims);
+  }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
+void test_execute_chipping_lvalue(Device d) {
+  test_execute_chipping_lvalue_runner<T, NumDims, Device, Vectorizable, Tiling, Layout>::run(d);
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_shuffle_rvalue(Device d) {
+void test_execute_shuffle_rvalue(Device d) {
   static constexpr int Options = 0 | Layout;
 
   auto dims = RandomDims<NumDims>(1, 10);
@@ -255,7 +266,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_shuffle_lvalue(Device d) {
+void test_execute_shuffle_lvalue(Device d) {
   static constexpr int Options = 0 | Layout;
 
   auto dims = RandomDims<NumDims>(5, 10);
@@ -289,7 +300,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_reshape(Device d) {
+void test_execute_reshape(Device d) {
   static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
 
   static constexpr int ReshapedDims = NumDims - 1;
@@ -326,7 +337,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_slice_rvalue(Device d) {
+void test_execute_slice_rvalue(Device d) {
   static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
   static constexpr int Options = 0 | Layout;
 
@@ -362,7 +373,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_slice_lvalue(Device d) {
+void test_execute_slice_lvalue(Device d) {
   static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
   static constexpr int Options = 0 | Layout;
 
@@ -402,7 +413,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_broadcasting_of_forced_eval(Device d) {
+void test_execute_broadcasting_of_forced_eval(Device d) {
   static constexpr int Options = 0 | Layout;
 
   auto dims = RandomDims<NumDims>(1, 10);
@@ -442,7 +453,7 @@
 };
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_generator_op(Device d) {
+void test_execute_generator_op(Device d) {
   static constexpr int Options = 0 | Layout;
 
   auto dims = RandomDims<NumDims>(20, 30);
@@ -470,7 +481,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_reverse_rvalue(Device d) {
+void test_execute_reverse_rvalue(Device d) {
   static constexpr int Options = 0 | Layout;
 
   auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
@@ -502,7 +513,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_async_execute_unary_expr(Device d) {
+void test_async_execute_unary_expr(Device d) {
   static constexpr int Options = 0 | Layout;
 
   // Pick a large enough tensor size to bypass small tensor block evaluation
@@ -532,7 +543,7 @@
 }
 
 template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_async_execute_binary_expr(Device d) {
+void test_async_execute_binary_expr(Device d) {
   static constexpr int Options = 0 | Layout;
 
   // Pick a large enough tensor size to bypass small tensor block evaluation
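The chipping tests above replace the TEST_CHIPPING macro loop with compile-time recursion: run_dim<N> tests dimension N and recurses to run_dim<N-1>, with an enable_if base case ending the recursion below zero. A minimal sketch of the pattern:

    #include <iostream>
    #include <type_traits>

    template <int Dim>
    std::enable_if_t<(Dim >= 0), void> run_dim() {
      std::cout << "testing dim " << Dim << "\n";
      run_dim<Dim - 1>();  // recurse to the next lower dimension
    }

    template <int Dim>
    std::enable_if_t<(Dim < 0), void> run_dim() {}  // base case: stop below dim 0

    int main() {
      run_dim<2>();  // prints dims 2, 1, 0
      return 0;
    }
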
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 22d6665..4bf5452 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -7,6 +7,9 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <cstdlib>
+#include <string>
+
 #include "sparse.h"
 
 #ifdef min
@@ -19,6 +22,16 @@
 
 #include <Eigen/SparseExtra>
 
+// Read from an environment variable TEST_TMPDIR, if available,
+// and append the provided filename. Defaults to local directory.
+std::string GetTestTempFilename(const char* filename) {
+  const char* test_tmpdir = std::getenv("TEST_TMPDIR");
+  if (test_tmpdir == nullptr) {
+    return std::string(filename);
+  }
+  return std::string(test_tmpdir) + std::string("/") + std::string(filename);
+}
+
 template <typename SetterType, typename DenseType, typename Scalar, int Options>
 bool test_random_setter(SparseMatrix<Scalar, Options>& sm, const DenseType& ref,
                         const std::vector<Vector2i>& nonzeroCoords) {
@@ -116,8 +129,9 @@
   Index cols = internal::random<Index>(1, 100);
   SparseMatrixType m1, m2;
   m1 = DenseMatrix::Random(rows, cols).sparseView();
-  saveMarket(m1, "sparse_extra.mtx");
-  loadMarket(m2, "sparse_extra.mtx");
+  std::string filename = GetTestTempFilename("sparse_extra.mtx");
+  saveMarket(m1, filename);
+  loadMarket(m2, filename);
   VERIFY_IS_EQUAL(DenseMatrix(m1), DenseMatrix(m2));
 }
 
@@ -126,8 +140,9 @@
   Index size = internal::random<Index>(1, 100);
   VectorType v1, v2;
   v1 = VectorType::Random(size);
-  saveMarketVector(v1, "vector_extra.mtx");
-  loadMarketVector(v2, "vector_extra.mtx");
+  std::string filename = GetTestTempFilename("vector_extra.mtx");
+  saveMarketVector(v1, filename);
+  loadMarketVector(v2, filename);
   VERIFY_IS_EQUAL(v1, v2);
 }
 
@@ -149,8 +164,9 @@
 
   DenseMatrixType m1, m2;
   m1 = DenseMatrixType::Random(rows, cols);
-  saveMarketDense(m1, "dense_extra.mtx");
-  loadMarketDense(m2, "dense_extra.mtx");
+  std::string filename = GetTestTempFilename("dense_extra.mtx");
+  saveMarketDense(m1, filename);
+  loadMarketDense(m2, filename);
   VERIFY_IS_EQUAL(m1, m2);
 }
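A hedged illustration of how the new temp-directory hook behaves: with TEST_TMPDIR set, the Market files land in that directory, otherwise in the working directory. The standalone copy of the helper below is for demonstration only, and setenv is POSIX-specific:

    #include <cstdlib>
    #include <iostream>
    #include <string>

    std::string GetTestTempFilename(const char* filename) {  // mirrors the helper above
      const char* dir = std::getenv("TEST_TMPDIR");
      return dir ? std::string(dir) + "/" + filename : std::string(filename);
    }

    int main() {
      std::cout << GetTestTempFilename("sparse_extra.mtx") << "\n";  // sparse_extra.mtx
      setenv("TEST_TMPDIR", "/tmp/eigen_tests", /*overwrite=*/1);
      std::cout << GetTestTempFilename("sparse_extra.mtx") << "\n";  // /tmp/eigen_tests/sparse_extra.mtx
      return 0;
    }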