Update Eigen to commit:75bcd155c40cb48e647c87c3f29052360255bc9e
CHANGELOG
=========
75bcd155c - Vectorize tan(x)
01a919d13 - Fix AOCL cmake issues.
a73501cc7 - Added versioning for shared libraries.
db90c4939 - Add a ptanh_float implementation that is accurate to 1 ULP
48eb5227c - Add BLAS function axpby.
a1eeb0220 - Expand CMake compatibility range for single-version specifications.
8a1083e9b - Aocl integration updated
a6630c53c - Fix bug introduced in !2030
PiperOrigin-RevId: 839836117
Change-Id: I1dd1114e3a79200e9506a83c6c4db964d48deac2
diff --git a/Eigen/Core b/Eigen/Core
index 94fd6ec..9f81658 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -53,6 +53,8 @@
// this include file manages BLAS and MKL related macros
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
+#include "src/Core/util/AOCL_Support.h"
+
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
@@ -463,6 +465,10 @@
#include "src/Core/Assign_MKL.h"
#endif
+#ifdef EIGEN_USE_AOCL_VML
+#include "src/Core/Assign_AOCL.h"
+#endif
+
#include "src/Core/GlobalFunctions.h"
// IWYU pragma: end_exports
diff --git a/Eigen/src/Core/Assign_AOCL.h b/Eigen/src/Core/Assign_AOCL.h
new file mode 100644
index 0000000..da3ef7c
--- /dev/null
+++ b/Eigen/src/Core/Assign_AOCL.h
@@ -0,0 +1,301 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Description:
+ * ------------
+ * This file implements a high-performance dispatch layer that automatically
+ * routes Eigen's element-wise mathematical operations to AMD Optimizing CPU
+ * Libraries (AOCL) Vector Math Library (VML) functions when beneficial for
+ * performance.
+ *
+ * The dispatch system uses C++ template specialization to intercept Eigen's
+ * assignment operations and redirect them to AOCL's VRDA functions, which
+ * provide optimized implementations for AMD Zen architectures.
+ *
+ * Key Features:
+ * -------------
+ * 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without
+ * requiring code changes in user applications
+ *
+ * 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen
+ * family processors with automatic SIMD instruction selection (AVX2, AVX-512)
+ *
+ * 3. Threshold-Based Activation: Only activates for vectors larger than
+ * EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on
+ * small vectors
+ *
+ * 4. Precision-Specific Handling:
+ * - Double precision: AOCL VRDA vectorized functions
+ * - Single precision: Scalar fallback (preserves correctness)
+ *
+ * 5. Memory Layout Compatibility: Ensures direct memory access and compatible
+ * storage orders between source and destination for optimal performance
+ *
+ * Supported Operations:
+ * ---------------------
+ * UNARY OPERATIONS (vector → vector):
+ * - Transcendental: exp(), sin(), cos(), sqrt(), log(), log10(), log2()
+ *
+ * BINARY OPERATIONS (vector op vector → vector):
+ * - Arithmetic: +, *, pow()
+ *
+ * Template Specialization Mechanism:
+ * -----------------------------------
+ * The system works by specializing Eigen's Assignment template for:
+ * 1. CwiseUnaryOp with scalar_*_op functors (unary operations)
+ * 2. CwiseBinaryOp with scalar_*_op functors (binary operations)
+ * 3. Dense2Dense assignment context with AOCL-compatible traits
+ *
+ * Dispatch conditions (all must be true):
+ * - Source and destination have DirectAccessBit (contiguous memory)
+ * - Compatible storage orders (both row-major or both column-major)
+ * - Vector size ≥ EIGEN_AOCL_VML_THRESHOLD or Dynamic size
+ * - Supported data type (currently double precision for VRDA)
+ *
+ * Integration Example:
+ * --------------------
+ * // Standard Eigen code - no changes required
+ * VectorXd x = VectorXd::Random(10000);
+ * VectorXd y = VectorXd::Random(10000);
+ * VectorXd result;
+ *
+ * // These operations are automatically dispatched to AOCL:
+ * result = x.array().exp(); // → amd_vrda_exp()
+ * result = x.array().sin(); // → amd_vrda_sin()
+ * result = x.array() + y.array(); // → amd_vrda_add()
+ * result = x.array().pow(y.array()); // → amd_vrda_pow()
+ *
+ * Configuration:
+ * --------------
+ * Required preprocessor definitions:
+ * - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration
+ * - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch
+ *
+ * Compilation Requirements:
+ * -------------------------
+ * Include paths:
+ * - AOCL headers: -I${AOCL_ROOT}/include
+ * - Eigen headers: -I/path/to/eigen
+ *
+ * Link libraries:
+ * - AOCL MathLib: -lamdlibm
+ * - Standard math: -lm
+ *
+ * Compiler flags:
+ * - Optimization: -O3 (required for inlining)
+ * - Architecture: -march=znver5 or -march=native
+ * - Vectorization: -mfma -mavx512f (if supported)
+ *
+ * Platform Support:
+ * ------------------
+ * - Primary: Linux x86_64 with AMD Zen family processors
+ * - Compilers: GCC 8+, Clang 10+, AOCC (recommended)
+ * - AOCL Version: 4.0+ (with VRDA support)
+ *
+ * Error Handling:
+ * ---------------
+ * - Graceful fallback to scalar operations for unsupported configurations
+ * - Compile-time detection of AOCL availability
+ * - Runtime size and alignment validation with eigen_assert()
+ *
+ * Developer:
+ * ----------
+ * Name: Sharad Saurabh Bhaskar
+ * Email: shbhaska@amd.com
+ * Organization: Advanced Micro Devices, Inc.
+ */
+
+
+#ifndef EIGEN_ASSIGN_AOCL_H
+#define EIGEN_ASSIGN_AOCL_H
+
+namespace Eigen {
+namespace internal {
+
+// Traits for unary operations.
+template <typename Dst, typename Src> class aocl_assign_traits {
+private:
+ enum {
+ DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
+ SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit),
+ StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+ InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
+ : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
+ : int(Dst::RowsAtCompileTime),
+ LargeEnough =
+ (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
+ };
+
+public:
+ enum {
+ EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess &&
+ StorageOrdersAgree && LargeEnough,
+ Traversal = LinearTraversal
+ };
+};
+
+// Traits for binary operations (e.g., add, pow).
+template <typename Dst, typename Lhs, typename Rhs>
+class aocl_assign_binary_traits {
+private:
+ enum {
+ DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
+ LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit),
+ RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit),
+ StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) &&
+ (int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)),
+ InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
+ : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
+ : int(Dst::RowsAtCompileTime),
+ LargeEnough =
+ (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
+ };
+
+public:
+ enum {
+ EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess &&
+ RhsHasDirectAccess && StorageOrdersAgree && LargeEnough
+ };
+};
+
+// Unary operation dispatch for float (scalar fallback).
+#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP) \
+ template <typename DstXprType, typename SrcXprNested> \
+ struct Assignment< \
+ DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>, \
+ assign_op<float, float>, Dense2Dense, \
+ std::enable_if_t< \
+ aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
+ typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<float, float> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ if (n <= 0) \
+ return; \
+ const float *input = \
+ reinterpret_cast<const float *>(src.nestedExpression().data()); \
+ float *output = reinterpret_cast<float *>(dst.data()); \
+ for (Eigen::Index i = 0; i < n; ++i) { \
+ output[i] = std::EIGENOP(input[i]); \
+ } \
+ } \
+ };
+
+// Unary operation dispatch for double (AOCL vectorized).
+#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
+ template <typename DstXprType, typename SrcXprNested> \
+ struct Assignment< \
+ DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>, \
+ assign_op<double, double>, Dense2Dense, \
+ std::enable_if_t< \
+ aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
+ typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<double, double> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
+ if (n <= 0) \
+ return; \
+ const double *input = \
+ reinterpret_cast<const double *>(src.nestedExpression().data()); \
+ double *output = reinterpret_cast<double *>(dst.data()); \
+ int aocl_n = internal::convert_index<int>(n); \
+ AOCLOP(aocl_n, const_cast<double *>(input), output); \
+ } \
+ };
+
+// Instantiate unary calls for float (scalar).
+// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp)
+
+// Instantiate unary calls for double (AOCL vectorized).
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2)
+
+// Binary operation dispatch for float (scalar fallback).
+#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC) \
+ template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
+ struct Assignment< \
+ DstXprType, \
+ CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
+ RhsXprNested>, \
+ assign_op<float, float>, Dense2Dense, \
+ std::enable_if_t<aocl_assign_binary_traits< \
+ DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
+ typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
+ RhsXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<float, float> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ if (n <= 0) \
+ return; \
+ const float *lhs = reinterpret_cast<const float *>(src.lhs().data()); \
+ const float *rhs = reinterpret_cast<const float *>(src.rhs().data()); \
+ float *output = reinterpret_cast<float *>(dst.data()); \
+ for (Eigen::Index i = 0; i < n; ++i) { \
+ output[i] = STDFUNC(lhs[i], rhs[i]); \
+ } \
+ } \
+ };
+
+// Binary operation dispatch for double (AOCL vectorized).
+#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
+ template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
+ struct Assignment< \
+ DstXprType, \
+ CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
+ RhsXprNested>, \
+ assign_op<double, double>, Dense2Dense, \
+ std::enable_if_t<aocl_assign_binary_traits< \
+ DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
+ typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
+ RhsXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<double, double> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
+ if (n <= 0) \
+ return; \
+ const double *lhs = reinterpret_cast<const double *>(src.lhs().data()); \
+ const double *rhs = reinterpret_cast<const double *>(src.rhs().data()); \
+ double *output = reinterpret_cast<double *>(dst.data()); \
+ int aocl_n = internal::convert_index<int>(n); \
+ AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs), output); \
+ } \
+ };
+
+// Instantiate binary calls for float (scalar).
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>) // Using scalar_sum_op for addition
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow)
+
+// Instantiate binary calls for double (AOCL vectorized).
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow)
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax)
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGN_AOCL_H
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 5ee67a5..f4f6794 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -32,6 +32,7 @@
#ifdef EIGEN_VECTORIZE_AVX2
EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tan, Packet4d)
#endif
EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 318b375..eafff3d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -110,6 +110,7 @@
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -143,6 +144,7 @@
#ifdef EIGEN_VECTORIZE_AVX2
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
#endif
HasTanh = EIGEN_FAST_MATH,
HasErf = 1,
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index ddc766b..c69ba15 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -119,6 +119,7 @@
HasConj = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -154,6 +155,7 @@
HasCbrt = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasLog1p = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index c98f217..acc2048 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -178,6 +178,7 @@
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -3098,6 +3099,7 @@
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 96d2783..13cdba7 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -4,6 +4,7 @@
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -641,7 +642,7 @@
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
// exp(r) is computed using a 6th order minimax polynomial approximation.
-template <typename Packet>
+template <typename Packet, bool IsFinite>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
const Packet cst_zero = pset1<Packet>(0.0f);
const Packet cst_one = pset1<Packet>(1.0f);
@@ -656,10 +657,15 @@
const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
-
- // Clamp x.
- Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
- Packet x = pmin(_x, cst_exp_hi);
+ Packet zero_mask;
+ Packet x;
+ if (!IsFinite) {
+ // Clamp x.
+ zero_mask = pcmp_lt(_x, cst_exp_lo);
+ x = pmin(_x, cst_exp_hi);
+ } else {
+ x = _x;
+ }
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
@@ -682,7 +688,9 @@
const Packet p_low = padd(r, cst_one);
Packet y = pmadd(r, p_odd, p_even);
y = pmadd(r2, y, p_low);
-
+ if (IsFinite) {
+ return pldexp_fast(y, m);
+ }
// Return 2^m * exp(r).
const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
if (!predux_any(fast_pldexp_unsafe)) {
@@ -765,6 +773,11 @@
return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
}
+// Enum for selecting which function to compute. SinCos is intended to compute
+// pairs of Sin and Cos of the even entries in the packet, e.g.
+// SinCos([a, *, b, *]) = [sin(a), cos(a), sin(b), cos(b)].
+enum class TrigFunction : uint8_t { Sin, Cos, Tan, SinCos };
+
// The following code is inspired by the following stack-overflow answer:
// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
// It has been largely optimized:
@@ -821,7 +834,7 @@
return float(double(int64_t(p)) * pio2_62);
}
-template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
+template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -851,7 +864,7 @@
#if defined(EIGEN_VECTORIZE_FMA)
// This version requires true FMA for high accuracy.
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
- const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+ constexpr float huge_th = (Func == TrigFunction::Sin) ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
@@ -862,7 +875,7 @@
// The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
// and 2 ULP up to:
- const float huge_th = ComputeSine ? 25966.f : 18838.f;
+ constexpr float huge_th = (Func == TrigFunction::Sin) ? 25966.f : 18838.f;
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
EIGEN_OPTIMIZATION_BARRIER(x)
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
@@ -900,13 +913,6 @@
y_int = ploadu<PacketI>(y_int2);
}
- // Compute the sign to apply to the polynomial.
- // sin: sign = second_bit(y_int) xor signbit(_x)
- // cos: sign = second_bit(y_int+1)
- Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
- : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
- sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
-
// Get the polynomial selection mask from the second bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
@@ -935,7 +941,15 @@
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials.
- if (ComputeBoth) {
+ // Compute the sign to apply to the polynomial.
+ // sin: sign = second_bit(y_int) xor signbit(_x)
+ // cos: sign = second_bit(y_int+1)
+ Packet sign_bit = (Func == TrigFunction::Sin) ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
+ : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
+
+ if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) {
+ // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
Packet peven = peven_mask(x);
Packet ysin = pselect(poly_mask, y2, y1);
Packet ycos = pselect(poly_mask, y1, y2);
@@ -943,23 +957,28 @@
Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
- y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
+ y = (Func == TrigFunction::SinCos) ? pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos))
+ : pdiv(pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
} else {
- y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
+ y = (Func == TrigFunction::Sin) ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
y = pxor(y, sign_bit);
}
- // Update the sign and filter huge inputs
return y;
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
- return psincos_float<true>(x);
+ return psincos_float<TrigFunction::Sin>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
- return psincos_float<false>(x);
+ return psincos_float<TrigFunction::Cos>(x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x) {
+ return psincos_float<TrigFunction::Tan>(x);
}
// Trigonometric argument reduction for double for inputs smaller than 15.
@@ -999,7 +1018,7 @@
return t;
}
-template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
+template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -1086,19 +1105,26 @@
Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
Packet sign_bit, sFinalRes;
- if (ComputeBoth) {
+ if (Func == TrigFunction::Sin) {
+ sign_bit = sign_sin;
+ sFinalRes = pselect(poly_mask, ssin, scos);
+ } else if (Func == TrigFunction::Cos) {
+ sign_bit = sign_cos;
+ sFinalRes = pselect(poly_mask, scos, ssin);
+ } else if (Func == TrigFunction::Tan) {
+ // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
+ sign_bit = pxor(sign_sin, sign_cos);
+ sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin));
+ } else if (Func == TrigFunction::SinCos) {
Packet peven = peven_mask(x);
sign_bit = pselect((s), sign_sin, sign_cos);
sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
- } else {
- sign_bit = ComputeSine ? sign_sin : sign_cos;
- sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
}
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
sFinalRes = pxor(sFinalRes, sign_bit);
// If the inputs values are higher than that a value that the argument reduction can currently address, compute them
- // using std::sin and std::cos
+ // using the C++ standard library.
// TODO Remove it when huge angle argument reduction is implemented
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
const int PacketSize = unpacket_traits<Packet>::size;
@@ -1109,10 +1135,15 @@
for (int k = 0; k < PacketSize; ++k) {
double val = x_cpy[k];
if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
- if (ComputeBoth)
+ if (Func == TrigFunction::Sin) {
+ sincos_vals[k] = std::sin(val);
+ } else if (Func == TrigFunction::Cos) {
+ sincos_vals[k] = std::cos(val);
+ } else if (Func == TrigFunction::Tan) {
+ sincos_vals[k] = std::tan(val);
+ } else if (Func == TrigFunction::SinCos) {
sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
- else
- sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
+ }
}
}
sFinalRes = ploadu<Packet>(sincos_vals);
@@ -1122,26 +1153,31 @@
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
- return psincos_double<true>(x);
+ return psincos_double<TrigFunction::Sin>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
- return psincos_double<false>(x);
+ return psincos_double<TrigFunction::Cos>(x);
}
-template <bool ComputeSin, typename Packet>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x) {
+ return psincos_double<TrigFunction::Tan>(x);
+}
+
+template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, float>::value, Packet>
psincos_selector(const Packet& x) {
- return psincos_float<ComputeSin, Packet, true>(x);
+ return psincos_float<TrigFunction::SinCos, Packet>(x);
}
-template <bool ComputeSin, typename Packet>
+template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, double>::value, Packet>
psincos_selector(const Packet& x) {
- return psincos_double<ComputeSin, Packet, true>(x);
+ return psincos_double<TrigFunction::SinCos, Packet>(x);
}
// Generic implementation of acos(x).
@@ -1292,6 +1328,8 @@
return pxor(result, x_signmask);
}
+#ifdef EIGEN_FAST_MATH
+
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
Doesn't do anything fancy, just a 9/8-degree rational interpolant which
is accurate up to a couple of ulps in the (approximate) range [-8, 8],
@@ -1343,6 +1381,55 @@
return pdiv(p, q);
}
+#else
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise).
+ On the domain [-1.25:1.25] we use an approximation of the form
+ tanh(x) ~= x^3 * (P(x) / Q(x)) + x, where P and Q are polynomials in x^2.
+ For |x| > 1.25, tanh is implemented as tanh(x) = 1 - (2 / (1 + exp(2*x))).
+
+ This implementation has a maximum error of 1 ULP (measured with AVX2+FMA).
+
+ This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& x) {
+ // The polynomial coefficients were computed using Rminimax:
+ // % ./ratapprox --function="tanh(x)-x" --dom='[-1.25,1.25]' --num="[x^3,x^5]" --den="even"
+ // --type="[3,4]" --numF="[SG]" --denF="[SG]" --log --dispCoeff="dec" --output=tanhf.solly
+ constexpr float alpha[] = {-1.46725140511989593505859375e-02f, -3.333333432674407958984375e-01f};
+  constexpr float beta[] = {1.570280082523822784423828125e-02f, 4.4401752948760986328125e-01f, 1.0f};
+ const T x2 = pmul(x, x);
+ const T x3 = pmul(x2, x);
+ const T p = ppolevl<T, 1>::run(x2, alpha);
+ const T q = ppolevl<T, 2>::run(x2, beta);
+ const T small_tanh = pmadd(x3, pdiv(p, q), x);
+
+ const T sign_mask = pset1<T>(-0.0f);
+ const T abs_x = pandnot(x, sign_mask);
+ constexpr float kSmallThreshold = 1.25f;
+ const T large_mask = pcmp_lt(pset1<T>(kSmallThreshold), abs_x);
+ // Fast exit if all elements are small.
+ if (!predux_any(large_mask)) {
+ return small_tanh;
+ }
+
+ // Compute as 1 - (2 / (1 + exp(2*x)))
+ const T one = pset1<T>(1.0f);
+ const T two = pset1<T>(2.0f);
+ const T s = pexp_float<T, true>(pmul(two, abs_x));
+ const T abs_tanh = psub(one, pdiv(two, padd(s, one)));
+
+ // Handle infinite inputs and set sign bit.
+ constexpr float kHugeThreshold = 16.0f;
+ const T huge_mask = pcmp_lt(pset1<T>(kHugeThreshold), abs_x);
+ const T x_sign = pand(sign_mask, x);
+ const T large_tanh = por(x_sign, pselect(huge_mask, one, abs_tanh));
+ return pselect(large_mask, large_tanh, small_tanh);
+}
+
+#endif // EIGEN_FAST_MATH
+
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
This uses a 19/18-degree rational interpolant which
is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
@@ -1540,7 +1627,7 @@
// cis(y):
RealPacket y = pand(odd_mask, a.v);
y = por(y, pcplxflip(Packet(y)).v);
- RealPacket cisy = psincos_selector<false, RealPacket>(y);
+ RealPacket cisy = psincos_selector<RealPacket>(y);
cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)
const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 44ccb74..942ae12 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -95,7 +95,7 @@
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x);
/** \internal \returns exp(x) for single precision float */
-template <typename Packet>
+template <typename Packet, bool IsFinite = false>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);
/** \internal \returns exp(x) for double precision real numbers */
@@ -110,6 +110,10 @@
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
+/** \internal \returns tan(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x);
+
/** \internal \returns sin(x) for double precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
@@ -118,6 +122,10 @@
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);
+/** \internal \returns tan(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x);
+
/** \internal \returns asin(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
@@ -200,6 +208,7 @@
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET) \
+ EIGEN_FLOAT_PACKET_FUNCTION(tan, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET) \
@@ -216,6 +225,7 @@
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET) \
+ EIGEN_DOUBLE_PACKET_FUNCTION(tan, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET) \
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 6f93b15..bf50697 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -197,6 +197,7 @@
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -5017,6 +5018,7 @@
#endif
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasSqrt = 1,
HasRsqrt = 1,
HasCbrt = 1,
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
index 54db626..e0e0be4 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -507,6 +507,7 @@
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 1ea23b0..7d53fa2 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -183,6 +183,7 @@
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -216,6 +217,7 @@
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/SVE/MathFunctions.h b/Eigen/src/Core/arch/SVE/MathFunctions.h
index 8c8ed84..5967433 100644
--- a/Eigen/src/Core/arch/SVE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -36,6 +36,11 @@
return pcos_float(x);
}
+template <>
+EIGEN_STRONG_INLINE PacketXf ptan<PacketXf>(const PacketXf& x) {
+ return ptan_float(x);
+}
+
// Hyperbolic Tangent function.
template <>
EIGEN_STRONG_INLINE PacketXf ptanh<PacketXf>(const PacketXf& x) {
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 28fc62b..39b29fa 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -353,6 +353,7 @@
HasCmp = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasPow = 1,
diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h
index e142264..19e5e8f 100644
--- a/Eigen/src/Core/arch/clang/PacketMath.h
+++ b/Eigen/src/Core/arch/clang/PacketMath.h
@@ -56,6 +56,7 @@
HasReciprocal = 1,
HasSin = 1,
HasCos = 1,
+ HasTan = 1,
HasACos = 1,
HasASin = 1,
HasATan = 1,
diff --git a/Eigen/src/Core/util/AOCL_Support.h b/Eigen/src/Core/util/AOCL_Support.h
new file mode 100644
index 0000000..434ccfd
--- /dev/null
+++ b/Eigen/src/Core/util/AOCL_Support.h
@@ -0,0 +1,175 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Description:
+ * ------------
+ * This header file serves as the central configuration and integration point
+ * for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library.
+ * It orchestrates the integration of multiple AOCL components to provide
+ * optimal mathematical computing performance on AMD Zen family processors.
+ *
+ * AOCL Component Integration:
+ * ---------------------------
+ * 1. AOCL Vector Math Library (VML):
+ * - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions
+ * - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc.
+ * - SIMD vectorization for AMD architectures (AVX2, AVX-512)
+ * - Headers: amdlibm.h, amdlibm_vec.h
+ *
+ * 2. AOCL BLAS (BLIS - BLAS-like Library Instantiation Software):
+ * - High-performance Basic Linear Algebra Subprograms
+ * - Supports single-threaded (libblis) and multithreaded (libblis-mt)
+ * variants
+ * - Optimized matrix operations: GEMM, GEMV, TRSM, etc.
+ * - Headers: cblas.h, blis.h
+ *
+ * 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment):
+ * - Dense linear algebra operations: factorizations, eigenvalue solvers
+ * - Matrix decompositions: LU, Cholesky, QR, SVD
+ * - Eigenvalue/eigenvector computations optimized for AMD hardware
+ * - Headers: LAPACKE interface
+ *
+ * Configuration Options:
+ * ------------------------------
+ * EIGEN_AOCL_VML_THRESHOLD (default: 128):
+ * - Minimum vector size for AOCL VML dispatch
+ * - Smaller vectors use standard Eigen to avoid function call overhead
+ * - Optimal values: 64-512 depending on operation and data characteristics
+ *
+ *
+ *
+ * Architecture Support:
+ * ---------------------
+ * Optimized for AMD processor families:
+ * - Zen Architecture (Naples, Rome): AVX2 optimization
+ * - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2
+ * - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache
+ * - Zen 4 Architecture (Genoa, Raphael): AVX-512 support
+ * - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512
+ *
+ *
+ * Dependencies:
+ * -------------
+ * Required AOCL components:
+ * - libamdlibm: Core math library with VRDA functions
+ * - libblis or libblis-mt: BLAS implementation
+ * - libflame: LAPACK implementation
+ *
+ * System requirements:
+ * - AMD x86_64 processor (optimal performance)
+ * - Linux, Windows, or compatible POSIX system
+ * - C++11 or later standard
+ * - CMake 3.5+ for build system integration
+ *
+ * Developer:
+ * ----------
+ * Name: Sharad Saurabh Bhaskar
+ * Email: shbhaska@amd.com
+ * Organization: Advanced Micro Devices, Inc.
+ */
+
+#ifndef EIGEN_AOCL_SUPPORT_H
+#define EIGEN_AOCL_SUPPORT_H
+
+#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)
+
+#include <complex>
+
+// Define AOCL component flags based on main flags
+#ifdef EIGEN_USE_AOCL_ALL
+#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library
+#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS)
+
+// Enable Eigen BLAS backend only if BLIS provides compatible interface
+#if defined(EIGEN_AOCL_BLIS_COMPATIBLE)
+#define EIGEN_USE_BLAS // Enable Eigen BLAS backend
+#endif
+
+#define EIGEN_USE_LAPACKE // Enable LAPACK backend (FLAME)
+#endif
+
+#ifdef EIGEN_USE_AOCL_MT
+#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library
+#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS)
+
+// For multithreaded: disable EIGEN_USE_BLAS to avoid signature conflicts
+// Use direct BLIS calls instead through EIGEN_USE_AOCL_BLAS
+// #define EIGEN_USE_BLAS // Commented out - causes conflicts with BLIS
+// interface
+
+// Note: LAPACKE disabled in MT mode to avoid header conflicts
+// #define EIGEN_USE_LAPACKE // Disabled - causes header conflicts with BLIS LAPACKE
+#define EIGEN_AOCL_USE_BLIS_MT 1 // Enable multithreaded BLIS
+#endif
+
+// Handle standalone EIGEN_USE_AOCL_VML flag
+#ifndef EIGEN_USE_AOCL_VML
+#ifdef EIGEN_USE_AOCL_ALL
+#define EIGEN_USE_AOCL_VML
+#endif
+#ifdef EIGEN_USE_AOCL_MT
+#define EIGEN_USE_AOCL_VML
+#endif
+#endif
+
+// Configuration constants - define these for any AOCL usage
+#ifndef EIGEN_AOCL_VML_THRESHOLD
+#define EIGEN_AOCL_VML_THRESHOLD 128 // Threshold for VML dispatch
+#endif
+
+#ifndef AOCL_SIMD_WIDTH
+#define AOCL_SIMD_WIDTH 8 // AVX-512: 512 bits / 64 bits per double
+#endif
+
+// Include AOCL Math Library headers for VML
+#if defined(EIGEN_USE_AOCL_VML) || defined(EIGEN_USE_AOCL_ALL) || \
+ defined(EIGEN_USE_AOCL_MT)
+#if defined(__has_include)
+#if __has_include("amdlibm.h")
+#include "amdlibm.h"
+#ifndef AMD_LIBM_VEC_EXPERIMENTAL
+#define AMD_LIBM_VEC_EXPERIMENTAL
+#endif
+#if __has_include("amdlibm_vec.h")
+#include "amdlibm_vec.h"
+#endif
+#endif
+#else
+// Fallback for compilers without __has_include
+#include "amdlibm.h"
+#ifndef AMD_LIBM_VEC_EXPERIMENTAL
+#define AMD_LIBM_VEC_EXPERIMENTAL
+#endif
+#include "amdlibm_vec.h"
+#endif
+#endif
+
+// Include CBLAS headers when BLAS is enabled
+#ifdef EIGEN_USE_AOCL_BLAS
+#if defined(__has_include)
+#if __has_include("cblas.h")
+#include "cblas.h"
+#elif __has_include("blis.h")
+#include "blis.h"
+#endif
+#else
+// Fallback
+#include "cblas.h"
+#endif
+#endif
+
+namespace Eigen {
+// AOCL-specific type definitions
+typedef std::complex<double> dcomplex;
+typedef std::complex<float> scomplex;
+typedef int BlasIndex; // Standard BLAS index type
+} // namespace Eigen
+
+#endif // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT
+
+#endif // EIGEN_AOCL_SUPPORT_H
diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt
index 45488d7..c8a2885 100644
--- a/blas/CMakeLists.txt
+++ b/blas/CMakeLists.txt
@@ -21,7 +21,9 @@
if (EIGEN_BUILD_SHARED_LIBS)
add_library(eigen_blas SHARED ${EigenBlas_SRCS} "eigen_blas.def")
target_compile_definitions(eigen_blas PUBLIC "EIGEN_BLAS_BUILD_DLL")
- set_target_properties(eigen_blas PROPERTIES CXX_VISIBILITY_PRESET hidden)
+ set_target_properties(eigen_blas PROPERTIES CXX_VISIBILITY_PRESET hidden
+ VERSION ${EIGEN_VERSION_NUMBER}
+ SOVERSION ${EIGEN_MAJOR_VERSION})
list(APPEND EIGEN_BLAS_TARGETS eigen_blas)
endif()
diff --git a/blas/blas.h b/blas/blas.h
index 2f218a8..220e956 100644
--- a/blas/blas.h
+++ b/blas/blas.h
@@ -59,6 +59,19 @@
EIGEN_BLAS_API void BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
EIGEN_BLAS_API void BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
+EIGEN_BLAS_API void BLASFUNC(saxpby)(const int *, const float *, const float *, const int *, const float *, float *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(daxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(qaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(caxpby)(const int *, const float *, const float *, const int *, const float *, float *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(zaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(xaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+
EIGEN_BLAS_API void BLASFUNC(scopy)(int *, float *, int *, float *, int *);
EIGEN_BLAS_API void BLASFUNC(dcopy)(int *, double *, int *, double *, int *);
EIGEN_BLAS_API void BLASFUNC(qcopy)(int *, double *, int *, double *, int *);
diff --git a/blas/eigen_blas.def b/blas/eigen_blas.def
index fb18d44..20f26dc 100644
--- a/blas/eigen_blas.def
+++ b/blas/eigen_blas.def
@@ -13,6 +13,10 @@
zaxpy_
; caxpyc_
; zaxpyc_
+ saxpby_
+ daxpby_
+ caxpby_
+ zaxpby_
scopy_
dcopy_
ccopy_
@@ -91,8 +95,8 @@
; dmin_
; cmin_
; zmin_
-
-
+
+
; Level 2
sgemv_
dgemv_
diff --git a/blas/level1_impl.h b/blas/level1_impl.h
index a65af92..085b356 100644
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -22,11 +22,42 @@
else if (*incx > 0 && *incy > 0)
make_vector(y, *n, *incy) += alpha * make_vector(x, *n, *incx);
else if (*incx > 0 && *incy < 0)
- make_vector(y, *n, -*incy).reverse() += alpha * make_vector(x, *n, *incx);
+ make_vector(y, *n, -*incy) += alpha * make_vector(x, *n, *incx).reverse();
else if (*incx < 0 && *incy > 0)
make_vector(y, *n, *incy) += alpha * make_vector(x, *n, -*incx).reverse();
else if (*incx < 0 && *incy < 0)
- make_vector(y, *n, -*incy).reverse() += alpha * make_vector(x, *n, -*incx).reverse();
+ make_vector(y, *n, -*incy) += alpha * make_vector(x, *n, -*incx);
+}
+
+EIGEN_BLAS_FUNC(axpby)
+(const int *pn, const RealScalar *palpha, const RealScalar *px, const int *pincx, const RealScalar *pbeta,
+ RealScalar *py, const int *pincy) {
+ const Scalar *x = reinterpret_cast<const Scalar *>(px);
+ Scalar *y = reinterpret_cast<Scalar *>(py);
+ const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
+ const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
+ const int n = *pn;
+
+ if (n <= 0) return;
+
+ if (Eigen::numext::equal_strict(beta, Scalar(1))) {
+ EIGEN_BLAS_FUNC_NAME(axpy)(pn, palpha, px, pincx, py, pincy);
+ return;
+ }
+
+ const int incx = *pincx;
+ const int incy = *pincy;
+
+ if (incx == 1 && incy == 1)
+ make_vector(y, n) = alpha * make_vector(x, n) + beta * make_vector(y, n);
+ else if (incx > 0 && incy > 0)
+ make_vector(y, n, incy) = alpha * make_vector(x, n, incx) + beta * make_vector(y, n, incy);
+ else if (incx > 0 && incy < 0)
+ make_vector(y, n, -incy) = alpha * make_vector(x, n, incx).reverse() + beta * make_vector(y, n, -incy);
+ else if (incx < 0 && incy > 0)
+ make_vector(y, n, incy) = alpha * make_vector(x, n, -incx).reverse() + beta * make_vector(y, n, incy);
+ else if (incx < 0 && incy < 0)
+ make_vector(y, n, -incy) = alpha * make_vector(x, n, -incx) + beta * make_vector(y, n, -incy);
}
EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) {
diff --git a/cmake/Eigen3ConfigVersion.cmake.in b/cmake/Eigen3ConfigVersion.cmake.in
index b680c63..dbbb5b5 100644
--- a/cmake/Eigen3ConfigVersion.cmake.in
+++ b/cmake/Eigen3ConfigVersion.cmake.in
@@ -1,5 +1,5 @@
# This is a CMake version file for the Config-mode of find_package().
-#
+#
# The version constraint is compatible with the current package under the
# following conditions:
# - If a version range is specified, the package version falls within the
@@ -12,7 +12,8 @@
# - 3...<5 matches 3.0.0.0 to <5.0.0.0
# - 3...<5.1 matches 3.0.0.0 to <5.1.0.0
# - 3 matches 3.0.0.0 to <4.0.0.0
-# - 3.4 matches 3.4.0.0 to <3.5.0.0
+# - 3.4 matches 3.4.0.0 to <4.0.0.0 due to semantic versioning
+# - 3.4 EXACT matches 3.4.0.0 to <3.5.0.0
set(PACKAGE_VERSION "@CVF_VERSION@")
@@ -65,26 +66,30 @@
set(PACKAGE_VERSION_COMPATIBLE TRUE)
endif()
else()
- # Create exclusive upper bound.
+
+ # Semantic versioning upper bound.
+ math(EXPR _PACKAGE_FIND_VERSION_MAJOR "${PACKAGE_FIND_VERSION_MAJOR}+1")
+ set(_PACKAGE_FIND_VERSION_SEMVER_UPPER "${_PACKAGE_FIND_VERSION_MAJOR}.0.0.0")
+
+ # Create exclusive upper bound for exact match.
if (PACKAGE_FIND_VERSION_COUNT EQUAL 1)
- math(EXPR _PACKAGE_FIND_VERSION_MAJOR "${PACKAGE_FIND_VERSION_MAJOR}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${_PACKAGE_FIND_VERSION_MAJOR}.0.0.0")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${_PACKAGE_FIND_VERSION_SEMVER_UPPER}")
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 2)
math(EXPR _PACKAGE_FIND_VERSION_MINOR "${PACKAGE_FIND_VERSION_MINOR}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${_PACKAGE_FIND_VERSION_MINOR}.0.0")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${_PACKAGE_FIND_VERSION_MINOR}.0.0")
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 3)
math(EXPR _PACKAGE_FIND_VERSION_PATCH "${PACKAGE_FIND_VERSION_PATCH}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${_PACKAGE_FIND_VERSION_PATCH}.0")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${_PACKAGE_FIND_VERSION_PATCH}.0")
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 4)
math(EXPR _PACKAGE_FIND_VERSION_TWEAK "${PACKAGE_FIND_VERSION_TWEAK}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}.${_PACKAGE_FIND_VERSION_TWEAK}")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}.${_PACKAGE_FIND_VERSION_TWEAK}")
endif()
- if((_PACKAGE_VERSION_FULL VERSION_LESS PACKAGE_FIND_VERSION) OR (_PACKAGE_VERSION_FULL VERSION_GREATER_EQUAL _PACKAGE_FIND_VERSION_UPPER))
+ if((_PACKAGE_VERSION_FULL VERSION_LESS PACKAGE_FIND_VERSION) OR (_PACKAGE_VERSION_FULL VERSION_GREATER_EQUAL _PACKAGE_FIND_VERSION_SEMVER_UPPER))
set(PACKAGE_VERSION_COMPATIBLE FALSE)
else()
set(PACKAGE_VERSION_COMPATIBLE TRUE)
- if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
+ if(_PACKAGE_VERSION_FULL VERSION_LESS _PACKAGE_FIND_VERSION_EXACT_UPPER)
set(PACKAGE_VERSION_EXACT TRUE)
endif()
endif()
diff --git a/cmake/FindAOCL.cmake b/cmake/FindAOCL.cmake
new file mode 100644
index 0000000..5e49835
--- /dev/null
+++ b/cmake/FindAOCL.cmake
@@ -0,0 +1,292 @@
+
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+#
+# FindAOCL.cmake - CMake Module for AMD Optimizing CPU Libraries (AOCL)
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Description:
+# ------------
+# This CMake module locates and configures AMD Optimizing CPU Libraries (AOCL)
+# for high-performance mathematical computing on AMD processors. It searches for
+# and sets up the following AOCL components:
+#
+# 1. AOCL MathLib (libamdlibm): Vector Math Library providing optimized
+# transcendental functions (exp, sin, cos, sqrt, log, etc.) with VRDA
+# (Vector Rapid Double-precision Arithmetic) support for SIMD acceleration
+#
+# 2. AOCL BLAS (BLIS): Basic Linear Algebra Subprograms optimized for AMD
+# architectures, supporting both single-threaded (libblis) and multithreaded
+# (libblis-mt) execution with OpenMP parallelization
+#
+# 3. AOCL LAPACK (libflame): Linear Algebra PACKage providing dense matrix
+# factorizations, eigenvalue/eigenvector computations, and linear system
+# solvers optimized for AMD processors
+#
+# The module automatically detects the appropriate library variants based on
+# configuration flags and provides proper linking setup for optimal performance
+# on Zen, Zen2, Zen3, Zen4, and Zen5 architectures.
+#
+# Variables Set:
+# --------------
+# AOCL_FOUND - True if AOCL libraries are found
+# AOCL_LIBRARIES - List of AOCL libraries to link against
+# AOCL_INCLUDE_DIRS - Include directories for AOCL headers
+# AOCL_BLAS_TYPE - Type of BLIS library found ("multithreaded" or "single-threaded")
+# AOCL_CORE_LIB - Path to core AOCL math library
+# AOCL_BLAS_LIB - Path to AOCL BLAS library
+# AOCL_LAPACK_LIB - Path to AOCL LAPACK library
+#
+# Configuration Options:
+# ----------------------
+# EIGEN_AOCL_BENCH_USE_MT - When ON, searches for multithreaded BLIS first
+# When OFF, searches for single-threaded BLIS only
+#
+# # For multithreaded BLIS:
+# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=ON
+#
+# # For single-threaded BLIS:
+# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=OFF
+#
+# Library Search Paths:
+# ---------------------
+# The module searches for AOCL libraries in the following order:
+# 1. ${AOCL_ROOT}/lib (or ${AOCL_ROOT}/lib32 for 32-bit)
+# 2. /opt/amd/aocl/lib64 (or /opt/amd/aocl/lib32 for 32-bit)
+# 3. ${LIB_INSTALL_DIR}
+#
+# Expected Library Names:
+# -----------------------
+# Core MathLib: amdlibm, alm, almfast
+# BLAS Single: blis
+# BLAS Multi: blis-mt
+# LAPACK: flame
+#
+# Dependencies:
+# -------------
+# The module automatically links the following system libraries:
+# - libm (standard math library)
+# - libpthread (POSIX threads)
+# - librt (real-time extensions)
+#
+# Architecture Support:
+# ---------------------
+# Optimized for AMD Zen family processors (Zen, Zen2, Zen3, Zen4, Zen5)
+# with automatic architecture detection and SIMD instruction selection.
+#
+# Developer:
+# ----------
+# Name: Sharad Saurabh Bhaskar
+# Email: shbhaska@amd.com
+#
+
+if(NOT DEFINED AOCL_ROOT)
+ if(DEFINED ENV{AOCL_ROOT})
+ set(AOCL_ROOT $ENV{AOCL_ROOT})
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "AOCL_ROOT set from environment: ${AOCL_ROOT}")
+ endif()
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL_ROOT is not set. AOCL support will be disabled.")
+ endif()
+ set(AOCL_LIBRARIES "")
+ endif()
+endif()
+
+if(AOCL_LIBRARIES)
+ set(AOCL_FIND_QUIETLY TRUE)
+endif()
+
+# Determine default include directories
+set(AOCL_INCLUDE_DIRS "")
+if(AOCL_ROOT AND EXISTS "${AOCL_ROOT}/include")
+ list(APPEND AOCL_INCLUDE_DIRS "${AOCL_ROOT}/include")
+endif()
+if(EXISTS "/opt/amd/aocl/include")
+ list(APPEND AOCL_INCLUDE_DIRS "/opt/amd/aocl/include")
+endif()
+
+ if(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+ # Search for the core AOCL math library.
+ find_library(AOCL_CORE_LIB
+ NAMES amdlibm alm almfast
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_CORE_LIB)
+ message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
+ else()
+ message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+
+ # Conditional BLIS library search based on MT requirement
+ if(EIGEN_AOCL_BENCH_USE_MT)
+ # Search for multithreaded BLIS first
+ find_library(AOCL_BLAS_LIB
+ NAMES blis-mt
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "multithreaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
+ endif()
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ set(AOCL_BLAS_TYPE "single-threaded")
+ endif()
+ else()
+ # Search for single-threaded BLIS
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "single-threaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL single-threaded BLAS library not found.")
+ endif()
+ endif()
+ endif()
+
+ # Now search for AOCL LAPACK library.
+ find_library(AOCL_LAPACK_LIB
+ NAMES flame
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_LAPACK_LIB)
+ message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
+ else()
+ message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+
+ else()
+ # For 32-bit systems, similar search paths.
+ find_library(AOCL_CORE_LIB
+ NAMES amdlibm alm almfast
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_CORE_LIB)
+ message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
+ else()
+ message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+
+ # Conditional BLIS library search for 32-bit
+ if(EIGEN_AOCL_BENCH_USE_MT)
+ find_library(AOCL_BLAS_LIB
+ NAMES blis-mt
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "multithreaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
+ endif()
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ set(AOCL_BLAS_TYPE "single-threaded")
+ endif()
+ else()
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "single-threaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL single-threaded BLAS library not found.")
+ endif()
+ endif()
+ endif()
+
+ find_library(AOCL_LAPACK_LIB
+ NAMES flame
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_LAPACK_LIB)
+ message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
+ else()
+ message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+endif()
+
+# Combine the found libraries into one variable.
+if(AOCL_CORE_LIB)
+ set(AOCL_LIBRARIES ${AOCL_CORE_LIB})
+endif()
+if(AOCL_BLAS_LIB)
+ list(APPEND AOCL_LIBRARIES ${AOCL_BLAS_LIB})
+endif()
+if(AOCL_LAPACK_LIB)
+ list(APPEND AOCL_LIBRARIES ${AOCL_LAPACK_LIB})
+endif()
+if(AOCL_LIBRARIES)
+ # Link against the standard math and pthread libraries as well as librt
+ list(APPEND AOCL_LIBRARIES m pthread rt)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(AOCL DEFAULT_MSG AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
+mark_as_advanced(AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
diff --git a/doc/UsingAOCL.dox b/doc/UsingAOCL.dox
new file mode 100644
index 0000000..24ce698
--- /dev/null
+++ b/doc/UsingAOCL.dox
@@ -0,0 +1,289 @@
+/*
+ Copyright (c) 2025, AMD Inc. All rights reserved.
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of AMD nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ********************************************************************************
+ * Content : Documentation on the use of AMD AOCL through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingAOCL Using AMD® AOCL from %Eigen
+
+In %Eigen version 3.4 and later, users can benefit from built-in AMD® Optimizing CPU Libraries (AOCL) optimizations with an installed copy of AOCL 5.0 (or later).
+
+<a href="https://www.amd.com/en/developer/aocl.html"> AMD AOCL </a> provides highly optimized, multi-threaded mathematical routines for x86-64 processors with a focus on AMD "Zen"-based architectures. AOCL is available on Linux and Windows for x86-64 architectures.
+
+\note
+AMD® AOCL is freely available software, but it is the responsibility of users to download, install, and ensure their product's license allows linking to the AOCL libraries. AOCL is distributed under a permissive license that allows commercial use.
+
+Using AMD AOCL through %Eigen is straightforward:
+-# export \c AOCL_ROOT into your environment
+-# define one of the AOCL macros before including any %Eigen headers (see table below)
+-# link your program to AOCL libraries (BLIS, FLAME, LibM)
+-# ensure your system supports the target architecture optimizations
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to AMD AOCL routines.
+These substitutions apply only for \b Dynamic \b or \b large \b enough objects with one of the following standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+The AOCL integration targets three core components:
+- **BLIS**: High-performance BLAS implementation optimized for modern cache hierarchies
+- **FLAME**: Dense linear algebra algorithms providing LAPACK functionality
+- **LibM**: Optimized standard math routines with vectorized implementations
+
+\section TopicUsingAOCL_Macros Configuration Macros
+
+You can choose which parts will be substituted by defining one or multiple of the following macros:
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (AOCL-BLIS)</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external LAPACK routines via the LAPACKE C interface (AOCL-FLAME)</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. \n This currently concerns only JacobiSVD which would be replaced by \c gesvd.</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_AOCL_VML </td><td>Enables the use of AOCL LibM vector math operations for coefficient-wise functions</td></tr>
+<tr><td>\c EIGEN_USE_AOCL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_AOCL_VML</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_AOCL_MT </td><td>Equivalent to \c EIGEN_USE_AOCL_ALL, but ensures multi-threaded BLIS (\c libblis-mt) is used. \n \b Recommended for most applications.</td></tr>
+</table>
+
+\note The AOCL integration automatically enables optimizations when the matrix/vector size exceeds \c EIGEN_AOCL_VML_THRESHOLD (default: 128 elements). For smaller operations, Eigen's built-in vectorization may be faster due to function call overhead.
+
+\section TopicUsingAOCL_Performance Performance Considerations
+
+The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE macros can be combined with AOCL-specific optimizations:
+
+- **Multi-threading**: Use \c EIGEN_USE_AOCL_MT to automatically select the multi-threaded BLIS library
+- **Architecture targeting**: AOCL libraries are optimized for AMD Zen architectures (Zen, Zen2, Zen3, Zen4, Zen5)
+- **Vector Math Library**: AOCL LibM provides vectorized implementations that can operate on entire arrays simultaneously
+- **Memory layout**: Eigen's column-major storage directly matches AOCL's expected data layout for zero-copy operation
+
+\section TopicUsingAOCL_Types Supported Data Types and Sizes
+
+AOCL acceleration is applied to:
+- **Scalar types**: \c float, \c double, \c complex<float>, \c complex<double>
+- **Matrix/Vector sizes**: Dynamic size or compile-time size ≥ \c EIGEN_AOCL_VML_THRESHOLD
+- **Storage order**: Both column-major (default) and row-major layouts
+- **Memory alignment**: Eigen's data pointers are directly compatible with AOCL function signatures
+
+The current AOCL Vector Math Library integration is specialized for \c double precision, with automatic fallback to scalar implementations for \c float.
+
+\section TopicUsingAOCL_Functions Vector Math Functions
+
+The following table summarizes coefficient-wise operations accelerated by \c EIGEN_USE_AOCL_VML:
+
+<table class="manual">
+<tr><th>Code example</th><th>AOCL routines</th></tr>
+<tr><td>\code
+v2 = v1.array().exp();
+v2 = v1.array().sin();
+v2 = v1.array().cos();
+v2 = v1.array().tan();
+v2 = v1.array().log();
+v2 = v1.array().log10();
+v2 = v1.array().log2();
+v2 = v1.array().sqrt();
+v2 = v1.array().pow(1.5);
+v2 = v1.array() + v2.array();
+\endcode</td><td>\code
+amd_vrda_exp
+amd_vrda_sin
+amd_vrda_cos
+amd_vrda_tan
+amd_vrda_log
+amd_vrda_log10
+amd_vrda_log2
+amd_vrda_sqrt
+amd_vrda_pow
+amd_vrda_add
+\endcode</td></tr>
+</table>
+
+In the examples, v1 and v2 are dense vectors of type \c VectorXd with size ≥ \c EIGEN_AOCL_VML_THRESHOLD.
+
+\section TopicUsingAOCL_Example Complete Example
+
+\code
+#define EIGEN_USE_AOCL_MT
+#include <iostream>
+#include <Eigen/Dense>
+
+int main() {
+ const int n = 2048;
+
+ // Large matrices automatically use AOCL-BLIS for multiplication
+ Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
+ Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
+ Eigen::MatrixXd C = A * B; // Dispatched to dgemm
+
+ // Large vectors automatically use AOCL LibM for math functions
+ Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10000, 0, 10);
+ Eigen::VectorXd result = v.array().sin(); // Dispatched to amd_vrda_sin
+
+ // LAPACK decompositions use AOCL-FLAME
+ Eigen::LLT<Eigen::MatrixXd> llt(A); // Dispatched to dpotrf
+
+ std::cout << "Matrix norm: " << C.norm() << std::endl;
+ std::cout << "Vector result norm: " << result.norm() << std::endl;
+
+ return 0;
+}
+\endcode
+
+\section TopicUsingAOCL_Building Building and Linking
+
+To compile with AOCL support, set the \c AOCL_ROOT environment variable and link against the required libraries:
+
+\code
+export AOCL_ROOT=/path/to/aocl
+clang++ -O3 -g -DEIGEN_USE_AOCL_ALL \
+ -I./install/include -I${AOCL_ROOT}/include \
+ -Wno-parentheses my_app.cpp \
+ -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis \
+ -lpthread -lrt -lm -lomp \
+ -o eigen_aocl_example
+\endcode
+
+For multi-threaded performance, use the multi-threaded BLIS library:
+\code
+clang++ -O3 -g -DEIGEN_USE_AOCL_MT \
+ -I./install/include -I${AOCL_ROOT}/include \
+ -Wno-parentheses my_app.cpp \
+        -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis-mt \
+ -lpthread -lrt -lm -lomp \
+ -o eigen_aocl_example
+\endcode
+
+Key compiler and linker flags:
+- \c -DEIGEN_USE_AOCL_ALL: Enable all AOCL accelerations (BLAS, LAPACK, VML)
+- \c -DEIGEN_USE_AOCL_MT: Enable multi-threaded version (uses \c -lblis-mt)
+- \c -lblis: Single-threaded BLIS library
+- \c -lblis-mt: Multi-threaded BLIS library (recommended for performance)
+- \c -lflame: FLAME LAPACK implementation
+- \c -lamdlibm: AMD LibM vector math library
+- \c -lomp: OpenMP runtime for multi-threading support
+- \c -lpthread -lrt: System threading and real-time libraries
+- \c -Wno-parentheses: Suppress common warnings when using AOCL headers
+
+\subsection TopicUsingAOCL_EigenBuild Building Eigen with AOCL Support
+
+To build Eigen with AOCL support, use the following CMake configuration:
+
+\code
+cmake .. -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_C_COMPILER=clang \
+ -DCMAKE_CXX_COMPILER=clang++ \
+ -DCMAKE_INSTALL_PREFIX=$PWD/install \
+ -DINCLUDE_INSTALL_DIR=$PWD/install/include \
+ && make install -j$(nproc)
+\endcode
+
+
+To build Eigen with AOCL integration and benchmarking capabilities, use the following CMake configuration:
+
+\code
+cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON \
+ -DEIGEN_AOCL_BENCH_FLAGS="-O3 -mavx512f -fveclib=AMDLIBM" \
+ -DEIGEN_AOCL_BENCH_USE_MT=OFF \
+ -DEIGEN_AOCL_BENCH_ARCH=znver5 \
+ -DCMAKE_BUILD_TYPE=Debug \
+ -DCMAKE_C_COMPILER=clang \
+ -DCMAKE_CXX_COMPILER=clang++ \
+ -DCMAKE_INSTALL_PREFIX=$PWD/install \
+ -DINCLUDE_INSTALL_DIR=$PWD/install/include \
+ && make install -j$(nproc)
+\endcode
+
+**CMake Configuration Parameters:**
+
+<table class="manual">
+<tr><th>Parameter</th><th>Expected Values</th><th>Description</th></tr>
+<tr><td>\c EIGEN_BUILD_AOCL_BENCH</td><td>\c ON, \c OFF</td><td>Enable/disable AOCL benchmark compilation</td></tr>
+<tr class="alt"><td>\c EIGEN_AOCL_BENCH_FLAGS</td><td>Compiler flags string</td><td>Additional compiler optimizations: \c "-O3 -mavx512f -fveclib=AMDLIBM"</td></tr>
+<tr><td>\c EIGEN_AOCL_BENCH_USE_MT</td><td>\c ON, \c OFF</td><td>Use multi-threaded AOCL libraries (\c ON recommended for performance)</td></tr>
+<tr class="alt"><td>\c EIGEN_AOCL_BENCH_ARCH</td><td>\c znver3, \c znver4, \c znver5, \c native, \c generic</td><td>Target AMD architecture (match your CPU generation)</td></tr>
+<tr><td>\c CMAKE_BUILD_TYPE</td><td>\c Release, \c Debug, \c RelWithDebInfo</td><td>Build configuration (\c Release recommended for benchmarks)</td></tr>
+<tr class="alt"><td>\c CMAKE_C_COMPILER</td><td>\c clang, \c gcc</td><td>C compiler (clang recommended for AOCL)</td></tr>
+<tr><td>\c CMAKE_CXX_COMPILER</td><td>\c clang++, \c g++</td><td>C++ compiler (clang++ recommended for AOCL)</td></tr>
+<tr class="alt"><td>\c CMAKE_INSTALL_PREFIX</td><td>Installation path</td><td>Where to install Eigen headers</td></tr>
+<tr><td>\c INCLUDE_INSTALL_DIR</td><td>Header path</td><td>Specific path for Eigen headers</td></tr>
+</table>
+
+**Architecture Selection Guide:**
+- \c znver3: AMD Zen 3 (EPYC 7003, Ryzen 5000 series)
+- \c znver4: AMD Zen 4 (EPYC 9004, Ryzen 7000 series)
+- \c znver5: AMD Zen 5 (EPYC 9005, Ryzen 9000 series)
+- \c native: Auto-detect current CPU architecture
+- \c generic: Generic x86-64 without specific optimizations
+
+**Custom Compiler Flags Explanation:**
+- \c -O3: Maximum optimization level
+- \c -mavx512f: Enable AVX-512 instruction set (if supported)
+- \c -fveclib=AMDLIBM: Use AMD LibM for vectorized math functions
+
+\subsection TopicUsingAOCL_Benchmark Building the AOCL Benchmark
+
+After configuring Eigen, build the AOCL benchmark executable:
+
+\code
+cmake --build . --target benchmark_aocl -j$(nproc)
+\endcode
+
+This creates the \c benchmark_aocl executable that demonstrates AOCL acceleration with various matrix sizes and operations.
+
+**Running the Benchmark:**
+\code
+./benchmark_aocl
+\endcode
+
+The benchmark will automatically compare:
+- Eigen's native performance vs AOCL-accelerated operations
+- Matrix multiplication performance (BLIS vs Eigen)
+- Vector math functions performance (LibM vs Eigen)
+- Memory bandwidth utilization and cache efficiency
+
+\section TopicUsingAOCL_CMake CMake Integration
+
+When using CMake, a \c FindAOCL module can locate the AOCL libraries and define imported targets:
+
+\code
+find_package(AOCL REQUIRED)
+target_compile_definitions(my_target PRIVATE EIGEN_USE_AOCL_MT)
+target_link_libraries(my_target PRIVATE AOCL::BLIS_MT AOCL::FLAME AOCL::LIBM)
+\endcode
+
+\section TopicUsingAOCL_Troubleshooting Troubleshooting
+
+Common issues and solutions:
+
+- **Link errors**: Ensure \c AOCL_ROOT is set and libraries are in \c LD_LIBRARY_PATH
+- **Performance not improved**: Verify you're using matrices/vectors larger than the threshold
+- **Thread contention**: Set \c OMP_NUM_THREADS to match your CPU core count
+- **Architecture mismatch**: Use appropriate \c -march flag for your AMD processor
+
+\section TopicUsingAOCL_Links Links
+
+- AMD AOCL can be downloaded for free <a href="https://www.amd.com/en/developer/aocl.html">here</a>
+- The AOCL User Guide and documentation are available on the AMD Developer Portal
+- AOCL is also available through package managers and containerized environments
+
+*/
+
+}
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index d837fff..51bb455 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -110,7 +110,9 @@
# Build LAPACK but link BLAS.
target_compile_definitions(eigen_lapack PUBLIC "EIGEN_BLAS_LINK_DLL" "EIGEN_LAPACK_BUILD_DLL")
target_link_libraries(eigen_lapack eigen_blas)
- set_target_properties(eigen_lapack PROPERTIES CXX_VISIBILITY_PRESET hidden)
+ set_target_properties(eigen_lapack PROPERTIES CXX_VISIBILITY_PRESET hidden
+ VERSION ${EIGEN_VERSION_NUMBER}
+ SOVERSION ${EIGEN_MAJOR_VERSION})
list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack)
endif()