Update Eigen to commit: 3564668908afc66351c1c3cc47dca2fcdb91dc12

CHANGELOG
=========
356466890 - Fix overalign check.
f3929ac7e - Fix EIGEN_HAS_CXX17_OVERALIGN for icc
1b33a6374 - Fixes git add . doesn't include scripts/buildtests.in
a8bab0d8a - Patch SparseLU
910f6f65d - Adjust thresholds for bfloat16 product tests that are currently failing.
311cc0f9c - Enable NEON pcmp, plset, and complex psqrt
dbf7ae6f9 - Fix up C++ version detection macros and cmake tests.
bb6675caf - Fix incorrect NEON native fp16 multiplication.
dd85d2694 - Revert "Avoid mixing types in CompressedStorage.h"
c4fb6af24 - Enable NEON pabs for unsigned int types
400bc5cd5 - Add sparse_basic_1 to smoke tests.
04e4f0bb2 - Add missing colon in SparseMatrix.h.
3d8a8def8 - Avoid mixing types in CompressedStorage.h
4bb244679 - Add operators to CompressedStorageIterator
e1aee4ab3 - Update test of numext::signbit.
3717854a2 - Use numext::signbit instead of std::signbit, which is not defined for bfloat16.
37de43290 - Avoid using std::raise() for divide by zero
62de593c4 - Allow std::initializer_list constructors in constexpr expressions
6d3e3678b - optimize equalspace packetop
200483194 - add EqualSpaced / setEqualSpaced
273f80384 - Add BDCSVD_LAPACKE binding
03c9b4738 - Enable direct access for NestByValue.
b59f18b4f - Increase L2 and L3 cache size for Power10.
c614b2bbd - Fix index type for sparse index sorting.
44fe53915 - add sparse sort inner vectors function
d19416714 - Fix the bug using neon instruction fmla for data type half
31ab62d34 - Add support for Power10 (AltiVec) MMA instructions for bfloat16.
dcb042a87 - Fix serialization for non-compressed matrices.
2260e11eb - Fix reshape strides when input has non-zero inner stride.
23524ab6f - Changing BiCGSTAB parameters initialization so that it works with custom types
ab2b26fbc - Fix sparseLU solver when destination has a non-unit stride.
551eebc8c - Add synchronize method to all devices.
b7551bff9 - Fix a bunch of annoying compiler warnings in tests
e7b1ad031 - Add serialization for sparse matrix and sparse vector.
044f3f623 - Fix bug in handmade_aligned_realloc
672868393 - Small cleanup of IDRS.h
02805bd56 - Fix AVX2 psignbit
399ce1ed6 - Fix duplicate execution code for Power 8 Altivec in pstore_partial.
6431dfdb5 - Cross product for vectors of size 2. Fixes #1037
8588d8c74 - Correct pnegate for floating-point zero.
5eacb9e11 - Put brackets around unsigned type names.
37e40dca8 - Fix ambiguity in PPC for vec_splats call.
7dc6db75d - Fix typo in CholmodSupport
9b6d624ea - fix neon
7e398e943 - Add missing return keyword in psignbit for NEON.
82b152dbe - Add signbit function
8f8e36458 - Remove recently added sparse assert in SparseMapBase.
01a31b81b - Remove unused parameter name.
c5b896c5a - Allow empty matrices to be resized.
886aad136 - Disable patan for double on PPC.
ab407b2b6 - Fix handmade_aligned_malloc offset computation.
adb30efb2 - Add assert for invalid outerIndexPtr array in SparseMapBase.
c27d1abe4 - Fix pragma check for disabling fastmath.
a22637137 - Change handmade_aligned_malloc/realloc/free to store a 1 byte offset instead of absolute address
bf48d4633 - Explicitly state that indices must be sorted.
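
For reference, a minimal usage sketch of the new EqualSpaced / setEqualSpaced API added by 200483194 and the numext::signbit helper added by 82b152dbe. The semantics shown are an assumption inferred from the signatures in the diff below: equalspaced_op is expected to fill coefficient i with low + i*step, and numext::signbit returns the sign bit replicated as a bitmask reinterpreted in the scalar type (not a bool), so it stays defined for half and bfloat16.

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      // Assumed semantics: coefficient i is low + i * step.
      Eigen::VectorXd v = Eigen::VectorXd::EqualSpaced(5, /*low=*/1.0, /*step=*/0.25);
      Eigen::VectorXf w(4);
      w.setEqualSpaced(/*low=*/0.0f, /*step=*/2.0f);  // reuses the current size of w
      std::cout << v.transpose() << "\n" << w.transpose() << "\n";

      // Assumed semantics: all-ones bit pattern for negative inputs,
      // all-zeros (i.e. +0.0f) otherwise.
      float pos = Eigen::numext::signbit(2.5f);
      std::cout << std::boolalpha << (pos == 0.0f) << "\n";
    }
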

PiperOrigin-RevId: 500278312
Change-Id: I5ba2533555adb8e254879b45d370bb8fccb74c8f
diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index bed8924..1037bd5 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -22,7 +22,7 @@
   * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   * It provides the two following main factorization classes:
   * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
-  * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
+  * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
   *
   * For the sake of completeness, this module also propose the two following classes:
   * - class CholmodSimplicialLLT
diff --git a/Eigen/Core b/Eigen/Core
index 48c2121..623d735 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -177,10 +177,6 @@
 #include "src/Core/arch/Default/TypeCasting.h"
 #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
-#ifndef EIGEN_GPU_COMPILE_PHASE
-  #include <csignal>
-#endif
-
 #if defined EIGEN_VECTORIZE_AVX512
   #if defined EIGEN_VECTORIZE_AVX512FP16
     #include "src/Core/arch/AVX512/PacketMathFP16.h"
diff --git a/Eigen/SVD b/Eigen/SVD
index 3451794..8241c73 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -36,14 +36,17 @@
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
-#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
 #else
 #include "src/misc/lapacke.h"
 #endif
+#ifndef EIGEN_USE_LAPACKE_STRICT
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
+#include "src/SVD/BDCSVD_LAPACKE.h"
+#endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/SparseLU b/Eigen/SparseLU
index 37c4a5c..047cf0d 100644
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -25,8 +25,6 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-#include "src/SparseLU/SparseLU_gemm_kernel.h"
-
 #include "src/SparseLU/SparseLU_Structs.h"
 #include "src/SparseLU/SparseLU_SupernodalMatrix.h"
 #include "src/SparseLU/SparseLUImpl.h"
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 7be8971..d7a5e7a 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -193,8 +193,9 @@
       *
       * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
       */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(
+        const std::initializer_list<std::initializer_list<Scalar>>& list)
+        : Base(list) {}
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename T>
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index a62f54d..b33c052 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -306,6 +306,20 @@
   return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }
 
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessEqualSpacedReturnType
+DenseBase<Derived>::EqualSpaced(Index size, const Scalar& low, const Scalar& step) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(size, internal::equalspaced_op<Scalar>(low, step));
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessEqualSpacedReturnType
+DenseBase<Derived>::EqualSpaced(const Scalar& low, const Scalar& step) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::equalspaced_op<Scalar>(low, step));
+}
+
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
 EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
@@ -455,6 +469,19 @@
   return setLinSpaced(size(), low, high);
 }
 
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setEqualSpaced(Index newSize, const Scalar& low,
+                                                                                  const Scalar& step) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return derived() = Derived::NullaryExpr(newSize, internal::equalspaced_op<Scalar>(low, step));
+}
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setEqualSpaced(const Scalar& low,
+                                                                                  const Scalar& step) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return setEqualSpaced(size(), low, step);
+}
+
 // zero:
 
 /** \returns an expression of a zero matrix.
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 6e17779..bcfd0f6 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -258,6 +258,8 @@
     EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
     typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
+    /** \internal Represents a vector with equally spaced coefficients that allows random access. */
+    typedef CwiseNullaryOp<internal::equalspaced_op<Scalar>, PlainObject> RandomAccessEqualSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
@@ -336,6 +338,11 @@
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(const Scalar& low, const Scalar& high);
 
+    EIGEN_DEVICE_FUNC static const RandomAccessEqualSpacedReturnType
+    EqualSpaced(Index size, const Scalar& low, const Scalar& step);
+    EIGEN_DEVICE_FUNC static const RandomAccessEqualSpacedReturnType
+    EqualSpaced(const Scalar& low, const Scalar& step);
+
     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
     static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
@@ -357,6 +364,8 @@
     EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
     EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
     EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setEqualSpaced(Index size, const Scalar& low, const Scalar& step);
+    EIGEN_DEVICE_FUNC Derived& setEqualSpaced(const Scalar& low, const Scalar& step);
     EIGEN_DEVICE_FUNC Derived& setZero();
     EIGEN_DEVICE_FUNC Derived& setOnes();
     EIGEN_DEVICE_FUNC Derived& setRandom();
@@ -661,8 +670,7 @@
   protected:
     EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
     /** Default constructor. Do nothing. */
-    EIGEN_DEVICE_FUNC DenseBase()
-    {
+    EIGEN_DEVICE_FUNC constexpr DenseBase() {
       /* Just checks for self-consistency of the flags.
        * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
        */
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 5e2763e..cf588bd 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -26,14 +26,12 @@
 
 struct constructor_without_unaligned_array_assert {};
 
-template<typename T, int Size>
-EIGEN_DEVICE_FUNC
-void check_static_allocation_size()
-{
-  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
-  #if EIGEN_STACK_ALLOCATION_LIMIT
+template <typename T, int Size>
+EIGEN_DEVICE_FUNC constexpr void check_static_allocation_size() {
+// if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
+#if EIGEN_STACK_ALLOCATION_LIMIT
   EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
-  #endif
+#endif
 }
 
 /** \internal
@@ -47,16 +45,10 @@
 {
   T array[Size];
 
-  EIGEN_DEVICE_FUNC
-  plain_array()
-  {
-    check_static_allocation_size<T,Size>();
-  }
+  EIGEN_DEVICE_FUNC constexpr plain_array() { check_static_allocation_size<T, Size>(); }
 
-  EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert)
-  {
-    check_static_allocation_size<T,Size>();
+  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
+    check_static_allocation_size<T, Size>();
   }
 };
 
@@ -87,17 +79,13 @@
 {
   EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
 
-  EIGEN_DEVICE_FUNC
-  plain_array()
-  {
+  EIGEN_DEVICE_FUNC constexpr plain_array() {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
     check_static_allocation_size<T,Size>();
   }
 
-  EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert)
-  {
-    check_static_allocation_size<T,Size>();
+  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
+    check_static_allocation_size<T, Size>();
   }
 };
 
@@ -106,17 +94,13 @@
 {
   EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
 
-  EIGEN_DEVICE_FUNC
-  plain_array()
-  {
+  EIGEN_DEVICE_FUNC constexpr plain_array() {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
     check_static_allocation_size<T,Size>();
   }
 
-  EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert)
-  {
-    check_static_allocation_size<T,Size>();
+  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
+    check_static_allocation_size<T, Size>();
   }
 };
 
@@ -125,17 +109,13 @@
 {
   EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
 
-  EIGEN_DEVICE_FUNC
-  plain_array()
-  {
+  EIGEN_DEVICE_FUNC constexpr plain_array() {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
     check_static_allocation_size<T,Size>();
   }
 
-  EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert)
-  {
-    check_static_allocation_size<T,Size>();
+  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
+    check_static_allocation_size<T, Size>();
   }
 };
 
@@ -144,17 +124,13 @@
 {
   EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
 
-  EIGEN_DEVICE_FUNC
-  plain_array()
-  {
+  EIGEN_DEVICE_FUNC constexpr plain_array() {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
     check_static_allocation_size<T,Size>();
   }
 
-  EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert)
-  {
-    check_static_allocation_size<T,Size>();
+  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
+    check_static_allocation_size<T, Size>();
   }
 };
 
@@ -162,8 +138,8 @@
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
   T array[1];
-  EIGEN_DEVICE_FUNC plain_array() {}
-  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
+  EIGEN_DEVICE_FUNC constexpr plain_array() {}
+  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {}
 };
 
 struct plain_array_helper {
@@ -211,26 +187,25 @@
 {
     internal::plain_array<T,Size,Options_> m_data;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() {
+    constexpr EIGEN_DEVICE_FUNC DenseStorage() {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
     }
-    EIGEN_DEVICE_FUNC
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()) {}
 #if defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC constexpr
     DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
     }
 #else
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default;
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) = default;
 #endif
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default;
-    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default;
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default;
-    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+    EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(const DenseStorage&) = default;
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(DenseStorage&&) = default;
+    EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(DenseStorage&&) = default;
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-      eigen_internal_assert(size==rows*cols && rows==Rows_ && cols==Cols_);
+      eigen_internal_assert(size == rows * cols && rows == Rows_ && cols == Cols_);
       EIGEN_UNUSED_VARIABLE(size);
       EIGEN_UNUSED_VARIABLE(rows);
       EIGEN_UNUSED_VARIABLE(cols);
@@ -238,57 +213,148 @@
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
       numext::swap(m_data, other.m_data);
     }
-    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return Rows_;}
-    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return Cols_;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; }
+    EIGEN_DEVICE_FUNC static constexpr Index cols(void) EIGEN_NOEXCEPT { return Cols_; }
+    EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index) {}
+    EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index) {}
+    EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
 };
 
 // null matrix
-template<typename T, int Rows_, int Cols_, int Options_> class DenseStorage<T, 0, Rows_, Cols_, Options_>
+template<typename T, int Rows_, int Cols_, int Options_>
+class DenseStorage<T, 0, Rows_, Cols_, Options_>
 {
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() {}
-    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
-    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
-    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return Rows_;}
-    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return Cols_;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
-    EIGEN_DEVICE_FUNC T *data() { return 0; }
+    static_assert(Rows_ * Cols_ == 0, "The fixed number of rows times columns must equal the storage size.");
+    EIGEN_DEVICE_FUNC constexpr DenseStorage() {}
+    EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) {}
+    EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(const DenseStorage&) { return *this; }
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage& ) {}
+    EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT {return Rows_;}
+    EIGEN_DEVICE_FUNC static constexpr Index cols(void) EIGEN_NOEXCEPT {return Cols_;}
+    EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC constexpr void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC constexpr const T *data() const { return 0; }
+    EIGEN_DEVICE_FUNC constexpr T *data() { return 0; }
 };
 
 // more specializations for null matrices; these are necessary to resolve ambiguities
-template<typename T, int Options_> class DenseStorage<T, 0, Dynamic, Dynamic, Options_>
-: public DenseStorage<T, 0, 0, 0, Options_> { };
+template<typename T, int Options_>
+class DenseStorage<T, 0, Dynamic, Dynamic, Options_> {
+    Index m_rows;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
+      m_rows = other.m_rows;
+      m_cols = other.m_cols;
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {
+      eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) {
+      m_rows = rows;
+      m_cols = cols;
+      eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) {
+      m_rows = rows;
+      m_cols = cols;
+      eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC const T *data() const { return nullptr; }
+    EIGEN_DEVICE_FUNC T *data() { return nullptr; }
+};
 
-template<typename T, int Rows_, int Options_> class DenseStorage<T, 0, Rows_, Dynamic, Options_>
-: public DenseStorage<T, 0, 0, 0, Options_> { };
+template<typename T, int Rows_, int Options_>
+class DenseStorage<T, 0, Rows_, Dynamic, Options_> {
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
+      m_cols = other.m_cols;
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {
+      eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_cols, other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return Rows_;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) {
+      m_cols = cols;
+      eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) {
+      m_cols = cols;
+      eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC const T *data() const { return nullptr; }
+    EIGEN_DEVICE_FUNC T *data() { return nullptr; }
+};
 
-template<typename T, int Cols_, int Options_> class DenseStorage<T, 0, Dynamic, Cols_, Options_>
-: public DenseStorage<T, 0, 0, 0, Options_> { };
+template<typename T, int Cols_, int Options_>
+class DenseStorage<T, 0, Dynamic, Cols_, Options_> {
+    Index m_rows;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
+      m_rows = other.m_rows;
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {
+      eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_rows, other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return Cols_;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) {
+      m_rows = rows;
+      eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) {
+      m_rows = rows;
+      eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size.");
+    }
+    EIGEN_DEVICE_FUNC const T *data() const { return nullptr; }
+    EIGEN_DEVICE_FUNC T *data() { return nullptr; }
+};
 
 // dynamic-size matrix with fixed-size storage
-template<typename T, int Size, int Options_> class DenseStorage<T, Size, Dynamic, Dynamic, Options_>
+template<typename T, int Size, int Options_>
+class DenseStorage<T, Size, Dynamic, Dynamic, Options_>
 {
     internal::plain_array<T,Size,Options_> m_data;
     Index m_rows;
     Index m_cols;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols)
-    {
-      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
-    }
+   EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(), m_rows(0), m_cols(0) {}
+   EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
+       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
+   EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
+       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols) {
+    internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+   }
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
@@ -299,36 +365,42 @@
       }
       return *this;
     }
-    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     {
       internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
       numext::swap(m_rows,other.m_rows);
       numext::swap(m_cols,other.m_cols);
     }
-    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
-    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+    EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+    EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index rows, Index cols) {
+      m_rows = rows;
+      m_cols = cols;
+    }
+    EIGEN_DEVICE_FUNC constexpr void resize(Index, Index rows, Index cols) {
+      m_rows = rows;
+      m_cols = cols;
+    }
+    EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage and fixed width
-template<typename T, int Size, int Cols_, int Options_> class DenseStorage<T, Size, Dynamic, Cols_, Options_>
+template<typename T, int Size, int Cols_, int Options_>
+class DenseStorage<T, Size, Dynamic, Cols_, Options_>
 {
     internal::plain_array<T,Size,Options_> m_data;
     Index m_rows;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
-    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows)
-    {
+   EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_rows(0) {}
+   EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
+       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
+   EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
+       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows) {
       internal::plain_array_helper::copy(other.m_data, m_rows * Cols_, m_data);
-    }
-    
+   }
+
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
@@ -338,34 +410,34 @@
       }
       return *this;
     }
-    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC constexpr DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     { 
       internal::plain_array_helper::swap(m_data, m_rows * Cols_, other.m_data, other.m_rows * Cols_);
       numext::swap(m_rows, other.m_rows);
     }
-    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
-    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return Cols_;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
-    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr Index rows(void) const EIGEN_NOEXCEPT { return m_rows; }
+    EIGEN_DEVICE_FUNC constexpr Index cols(void) const EIGEN_NOEXCEPT { return Cols_; }
+    EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC constexpr void resize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage and fixed height
-template<typename T, int Size, int Rows_, int Options_> class DenseStorage<T, Size, Rows_, Dynamic, Options_>
+template<typename T, int Size, int Rows_, int Options_>
+class DenseStorage<T, Size, Rows_, Dynamic, Options_>
 {
     internal::plain_array<T,Size,Options_> m_data;
     Index m_cols;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
-    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) 
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols)
-    {
+   EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_cols(0) {}
+   EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
+       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
+   EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
+       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols) {
       internal::plain_array_helper::copy(other.m_data, Rows_ * m_cols, m_data);
-    }
+   }
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
@@ -380,30 +452,32 @@
       internal::plain_array_helper::swap(m_data, Rows_ * m_cols, other.m_data, Rows_ * other.m_cols);
       numext::swap(m_cols, other.m_cols);
     }
-    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return Rows_;}
-    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr Index rows(void) const EIGEN_NOEXCEPT { return Rows_; }
+    EIGEN_DEVICE_FUNC constexpr Index cols(void) const EIGEN_NOEXCEPT { return m_cols; }
+    EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
 };
 
 // purely dynamic matrix.
-template<typename T, int Options_> class DenseStorage<T, Dynamic, Dynamic, Dynamic, Options_>
+template<typename T, int Options_>
+class DenseStorage<T, Dynamic, Dynamic, Dynamic, Options_>
 {
     T *m_data;
     Index m_rows;
     Index m_cols;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+   EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+   EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
        : m_data(0), m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
-      : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
-    {
+   EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+       : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)),
+         m_rows(rows),
+         m_cols(cols) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
-    }
+   }
     EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
       : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(other.m_rows*other.m_cols))
       , m_rows(other.m_rows)
@@ -473,19 +547,19 @@
 };
 
 // matrix with dynamic width and fixed height (so that matrix has dynamic size).
-template<typename T, int Rows_, int Options_> class DenseStorage<T, Dynamic, Rows_, Dynamic, Options_>
-{
+template<typename T, int Rows_, int Options_>
+class DenseStorage<T, Dynamic, Rows_, Dynamic, Options_> {
     T *m_data;
     Index m_cols;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_cols(cols)
-    {
+   EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_cols(0) {}
+   explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+   EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+       : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)), m_cols(cols) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       eigen_internal_assert(size==rows*cols && rows==Rows_ && cols >=0);
       EIGEN_UNUSED_VARIABLE(rows);
-    }
+   }
     EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
       : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(Rows_*other.m_cols))
       , m_cols(other.m_cols)
@@ -522,7 +596,7 @@
       numext::swap(m_data,other.m_data);
       numext::swap(m_cols,other.m_cols);
     }
-    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return Rows_;}
+    EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; }
     EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
     {
@@ -547,19 +621,20 @@
 };
 
 // matrix with dynamic height and fixed width (so that matrix has dynamic size).
-template<typename T, int Cols_, int Options_> class DenseStorage<T, Dynamic, Dynamic, Cols_, Options_>
+template<typename T, int Cols_, int Options_>
+class DenseStorage<T, Dynamic, Dynamic, Cols_, Options_>
 {
     T *m_data;
     Index m_rows;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_rows(rows)
-    {
+   EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_rows(0) {}
+   explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+   EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols)
+       : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)), m_rows(rows) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       eigen_internal_assert(size==rows*cols && rows>=0 && cols == Cols_);
       EIGEN_UNUSED_VARIABLE(cols);
-    }
+   }
     EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
       : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(other.m_rows*Cols_))
       , m_rows(other.m_rows)
@@ -597,7 +672,7 @@
       numext::swap(m_rows,other.m_rows);
     }
     EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
-    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return Cols_;}
+    EIGEN_DEVICE_FUNC static constexpr Index cols(void) { return Cols_; }
     void conservativeResize(Index size, Index rows, Index)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(Options_&DontAlign)==0>(m_data, size, m_rows*Cols_);
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index b67c4ed..af773dd 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -563,13 +563,13 @@
 parg(const Packet& a) { using numext::arg; return arg(a); }
 
 
-/** \internal \returns \a a logically shifted by N bits to the right */
+/** \internal \returns \a a arithmetically shifted by N bits to the right */
 template<int N> EIGEN_DEVICE_FUNC inline int
 parithmetic_shift_right(const int& a) { return a >> N; }
 template<int N> EIGEN_DEVICE_FUNC inline long int
 parithmetic_shift_right(const long int& a) { return a >> N; }
 
-/** \internal \returns \a a arithmetically shifted by N bits to the right */
+/** \internal \returns \a a logically shifted by N bits to the right */
 template<int N> EIGEN_DEVICE_FUNC inline int
 plogical_shift_right(const int& a) { return static_cast<int>(static_cast<unsigned int>(a) >> N); }
 template<int N> EIGEN_DEVICE_FUNC inline long int
@@ -1191,6 +1191,34 @@
   return preciprocal<Packet>(psqrt(a));
 }
 
+template <typename Packet, bool IsScalar = is_scalar<Packet>::value,
+    bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
+    struct psignbit_impl;
+template <typename Packet, bool IsInteger>
+struct psignbit_impl<Packet, true, IsInteger> {
+     EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return numext::signbit(a); }
+};
+template <typename Packet>
+struct psignbit_impl<Packet, false, false> {
+    // generic implementation if not specialized in PacketMath.h
+    // slower than arithmetic shift
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Packet run(const Packet& a) {
+        const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+        const Packet cst_neg_one = pset1<Packet>(Scalar(-1));
+        return pcmp_eq(por(pand(a, cst_neg_one), cst_pos_one), cst_neg_one);
+    }
+};
+template <typename Packet>
+struct psignbit_impl<Packet, false, true> {
+    // generic implementation for integer packets
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return pcmp_lt(a, pzero(a)); }
+};
+/** \internal \returns the sign bit of \a a as a bitmask*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr Packet
+psignbit(const Packet& a) { return psignbit_impl<Packet>::run(a); }
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 0eee333..b194353 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1531,6 +1531,37 @@
 }
 #endif
 
+template <typename Scalar, bool IsInteger = NumTraits<Scalar>::IsInteger, bool IsSigned = NumTraits<Scalar>::IsSigned>
+struct signbit_impl;
+template <typename Scalar>
+struct signbit_impl<Scalar, false, true> {
+  static constexpr size_t Size = sizeof(Scalar);
+  static constexpr size_t Shift = (CHAR_BIT * Size) - 1;
+  using intSize_t = typename get_integer_by_size<Size>::signed_type;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Scalar run(const Scalar& x) {
+    intSize_t a = bit_cast<intSize_t, Scalar>(x);
+    a = a >> Shift;
+    Scalar result = bit_cast<Scalar, intSize_t>(a);
+    return result;
+  }
+};
+template <typename Scalar>
+struct signbit_impl<Scalar, true, true> {
+  static constexpr size_t Size = sizeof(Scalar);
+  static constexpr size_t Shift = (CHAR_BIT * Size) - 1;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar run(const Scalar& x) { return x >> Shift; }
+};
+template <typename Scalar>
+struct signbit_impl<Scalar, true, false> {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar run(const Scalar&  ) {
+    return Scalar(0);
+  }
+};
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar signbit(const Scalar& x) {
+  return signbit_impl<Scalar>::run(x);
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T exp(const T &x) {
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 23acd8a..c7747f1 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -312,8 +312,9 @@
       *
       * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
       */
-    EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    EIGEN_DEVICE_FUNC explicit constexpr EIGEN_STRONG_INLINE Matrix(
+        const std::initializer_list<std::initializer_list<Scalar>>& list)
+        : Base(list) {}
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index d87c6b3..ea2178f 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -387,20 +387,9 @@
 
 /////////// Geometry module ///////////
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /// \internal helper struct to form the return type of the cross product
-    template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
-      typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
-    };
-    #endif // EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    inline typename cross_product_return_type<OtherDerived>::type
-#else
-    inline PlainObject
-#endif
+    inline typename internal::cross_impl<Derived, OtherDerived>::return_type
     cross(const MatrixBase<OtherDerived>& other) const;
 
     template<typename OtherDerived>
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 5f1dc84..311cb5a 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -43,6 +43,8 @@
   public:
 
     typedef typename internal::dense_xpr_base<NestByValue>::type Base;
+    static constexpr bool HasDirectAccess = internal::has_direct_access<ExpressionType>::ret;
+    
     EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
 
     EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
@@ -54,6 +56,18 @@
 
     EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
 
+    EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, const Scalar*>::type data() const {
+      return m_expression.data();
+    }
+    
+    EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, Index>::type innerStride() const {
+      return m_expression.innerStride();
+    }
+    
+    EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, Index>::type outerStride() const {
+      return m_expression.outerStride();
+    }
+
   protected:
     const ExpressionType m_expression;
 };
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 4f1f992..53362ef 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -95,7 +95,7 @@
   // Load src into registers first. This allows the memcpy to be elided by CUDA.
   const Src staged = src;
   EIGEN_USING_STD(memcpy)
-  memcpy(&tgt, &staged, sizeof(Tgt));
+  memcpy(static_cast<void*>(&tgt),static_cast<const void*>(&staged), sizeof(Tgt));
   return tgt;
 }
 }  // namespace numext
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 222eaf5..60a75b1 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -29,18 +29,13 @@
 namespace internal {
 
 template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
-  template<typename Index>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_ALWAYS_INLINE void run(Index, Index)
-  {
-  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index, Index) {}
 };
 
 template<> struct check_rows_cols_for_overflow<Dynamic> {
-  template<typename Index>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
-  {
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index rows, Index cols) {
     // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
     // we assume Index is signed
     Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
@@ -160,12 +155,10 @@
       * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
       *
       * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
-    {
-      if(Flags & RowMajorBit)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index rowId, Index colId) const {
+      if (Flags & RowMajorBit)
         return m_storage.data()[colId + rowId * m_storage.cols()];
-      else // column-major
+      else  // column-major
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
@@ -183,12 +176,10 @@
       * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
       *
       * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
-    {
-      if(Flags & RowMajorBit)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index rowId, Index colId) {
+      if (Flags & RowMajorBit)
         return m_storage.data()[colId + rowId * m_storage.cols()];
-      else // column-major
+      else  // column-major
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
@@ -196,28 +187,20 @@
       * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
       *
       * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
-    {
-      return m_storage.data()[index];
-    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { return m_storage.data()[index]; }
 
     /** This is the const version of coeffRef(Index,Index) which is thus synonym of coeff(Index,Index).
       * It is provided for convenience. */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      if(Flags & RowMajorBit)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index rowId, Index colId) const {
+      if (Flags & RowMajorBit)
         return m_storage.data()[colId + rowId * m_storage.cols()];
-      else // column-major
+      else  // column-major
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
     /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
       * It is provided for convenience. */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
-    {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index index) const {
       return m_storage.data()[index];
     }
 
@@ -279,9 +262,7 @@
       *
       * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
-    {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index rows, Index cols) {
       eigen_assert(internal::check_implication(RowsAtCompileTime!=Dynamic, rows==RowsAtCompileTime)
                    && internal::check_implication(ColsAtCompileTime!=Dynamic, cols==ColsAtCompileTime)
                    && internal::check_implication(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic, rows<=MaxRowsAtCompileTime)
@@ -309,12 +290,13 @@
       *
       * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
-    EIGEN_DEVICE_FUNC
-    inline void resize(Index size)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
-      eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime==Dynamic || size<=MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && size>=0);
-      #ifdef EIGEN_INITIALIZE_COEFFS
+    EIGEN_DEVICE_FUNC inline constexpr void resize(Index size) {
+        EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
+        eigen_assert(
+            ((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) ||
+             SizeAtCompileTime == size) &&
+            size >= 0);
+#ifdef EIGEN_INITIALIZE_COEFFS
         bool size_changed = size != this->size();
       #endif
       if(RowsAtCompileTime == 1)
@@ -334,11 +316,7 @@
       *
       * \sa resize(Index,Index)
       */
-    EIGEN_DEVICE_FUNC
-    inline void resize(NoChange_t, Index cols)
-    {
-      resize(rows(), cols);
-    }
+    EIGEN_DEVICE_FUNC inline constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
 
     /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
       * as in the example below.
@@ -348,11 +326,7 @@
       *
       * \sa resize(Index,Index)
       */
-    EIGEN_DEVICE_FUNC
-    inline void resize(Index rows, NoChange_t)
-    {
-      resize(rows, cols());
-    }
+    EIGEN_DEVICE_FUNC inline constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
 
     /** Resizes \c *this to have the same dimensions as \a other.
       * Takes care of doing all the checking that's needed.
@@ -552,10 +526,9 @@
     /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
       * lists
       */
-    EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
-      : m_storage()
-    {
+    EIGEN_DEVICE_FUNC explicit constexpr EIGEN_STRONG_INLINE PlainObjectBase(
+        const std::initializer_list<std::initializer_list<Scalar>>& list)
+        : m_storage() {
       size_t list_size = 0;
       if (list.begin() != list.end()) {
         list_size = list.begin()->size();
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index c90e61f..81355ac 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -251,7 +251,7 @@
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
-      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+      return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
     }
 
   protected:
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index ecbb73c..0fe830a 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -229,10 +229,7 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     HasCmp = 1,
-    size=4,
-
-    // requires AVX512
-    HasShift     = 0,
+    size=4
   };
 };
 #endif
@@ -360,6 +357,35 @@
 EIGEN_STRONG_INLINE Packet4l plogical_shift_left(Packet4l a) {
   return _mm256_slli_epi64(a, N);
 }
+#ifdef EIGEN_VECTORIZE_AVX512FP16
+template <int N>
+EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) { return _mm256_srai_epi64(a, N); }
+#else
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t< (N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return a;
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t< (N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
+  __m256i hi_word = _mm256_srai_epi32(a, N);
+  __m256i lo_word = _mm256_srli_epi64(a, N);
+  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t< (N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  __m256i hi_word = _mm256_srai_epi32(a, 31);
+  __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
+  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t< (N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t< (N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return parithmetic_shift_right<int(N&63)>(a);
+}
+#endif
 template <>
 EIGEN_STRONG_INLINE Packet4l pload<Packet4l>(const int64_t* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
@@ -518,11 +544,13 @@
 
 template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
 {
-  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  return _mm256_xor_ps(a, mask);
 }
 template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
 {
-  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
+  return _mm256_xor_pd(a, mask);
 }
 template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a)
 {
@@ -1103,6 +1131,13 @@
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h  psignbit(const Packet8h&  a) { return _mm_srai_epi16(a, 15); }
+template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return _mm_srai_epi16(a, 15); }
+template<> EIGEN_STRONG_INLINE Packet8f  psignbit(const Packet8f&  a) { return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a))); }
+#ifdef EIGEN_VECTORIZE_AVX2
+template<> EIGEN_STRONG_INLINE Packet4d  psignbit(const Packet4d& a)  { return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a))); }
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
   return pfrexp_generic(a,exponent);
 }
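
A scalar sketch, for illustration only, of why pnegate now flips the sign bit instead of subtracting from zero: 0.0f - (+0.0f) evaluates to +0.0f, so the old formulation returned the wrong sign for floating-point zero.

    #include <cstdint>
    #include <cstring>

    // Negate a float by XOR-ing its sign bit, mirroring the new pnegate.
    inline float negate_by_signflip(float x) {
      std::uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits ^= 0x80000000u;
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }
    // negate_by_signflip(0.0f) is -0.0f (sign bit set), whereas 0.0f - 0.0f is +0.0f.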
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 5f37740..159ae3e 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -346,11 +346,13 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
-  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
+  const __m512i mask = _mm512_set1_epi32(0x80000000);
+  return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
-  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
+  const __m512i mask = _mm512_set1_epi64(0x8000000000000000ULL);
+  return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
@@ -1127,6 +1129,11 @@
   return _mm512_abs_epi32(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h  psignbit(const Packet16h&  a) { return _mm256_srai_epi16(a, 15); }
+template<> EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) { return _mm256_srai_epi16(a, 15); }
+template<> EIGEN_STRONG_INLINE Packet16f  psignbit(const Packet16f&  a) { return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31)); }
+template<> EIGEN_STRONG_INLINE Packet8d   psignbit(const Packet8d&   a) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63)); }
+
 template<>
 EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
   return pfrexp_generic(a, exponent);
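
A scalar sketch, for illustration only, of the psignbit contract implemented by the shifts above: sign-extending the sign bit across the whole lane yields an all-ones mask for negative inputs (including -0.0 and negative NaNs) and an all-zeros mask otherwise.

    #include <cstdint>
    #include <cstring>

    inline std::uint32_t signbit_mask(float x) {
      std::int32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      return static_cast<std::uint32_t>(bits >> 31);  // arithmetic shift on Eigen's supported targets
    }
    // signbit_mask(-1.5f) == 0xFFFFFFFFu, signbit_mask(2.0f) == 0u, signbit_mask(-0.0f) == 0xFFFFFFFFu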
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index 324d050..f2e3e84 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -196,6 +196,13 @@
   return _mm512_abs_ph(a);
 }
 
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
 // pmin
 
 template <>
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 3b3b558..2429c81 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -91,6 +91,20 @@
   };
 };
 
+template<>
+struct quad_traits<bfloat16>
+{
+  typedef Packet8bf                       vectortype;
+  typedef PacketBlock<vectortype,4>             type;
+  typedef vectortype                         rhstype;
+  enum
+  {
+    vectorsize = packet_traits<bfloat16>::size,
+    size = 8,
+    rows = 4
+  };
+};
+
 // MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector; this turned out
 // to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. These constants
 // are then responsible for extracting from and converting between Eigen's and MatrixProduct's approaches.
@@ -2781,6 +2795,31 @@
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
+
+#if defined(__MMA__)
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef typename quad_traits<bfloat16>::vectortype   Packet;
+  typedef typename quad_traits<bfloat16>::rhstype      RhsPacket;
+
+  void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
+                  Index rows, Index depth, Index cols, bfloat16 alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
+               Index rows, Index depth, Index cols, bfloat16 alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<bfloat16>::rows;
+    const Index accCols = quad_traits<bfloat16>::size;
+
+    Eigen::internal::gemmMMAbfloat16<Index, Packet, RhsPacket, DataMapper, accRows, accCols>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+#endif
 } // end namespace internal
 
 } // end namespace Eigen
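
With this specialization in place, bfloat16 products on Power10 reach the MMA path through the regular product pipeline. A usage sketch, for illustration only (assumes a build with __MMA__ defined):

    Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic> A(64, 64), B(64, 64), C;
    A.setConstant(Eigen::bfloat16(0.5f));
    B.setConstant(Eigen::bfloat16(2.0f));
    C = A * B;  // dispatches to the gebp_kernel<bfloat16, bfloat16, ...> above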
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index aa1cbf8..e4013a7 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -28,6 +28,8 @@
 
 #include "../../InternalHeaderCheck.h"
 
+#include "MatrixProductMMAbfloat16.h"
+
 namespace Eigen {
 
 namespace internal {
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
new file mode 100644
index 0000000..b3e063d
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -0,0 +1,385 @@
+ #ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+ #define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_STRONG_INLINE void pgerMMAbfloat16(__vector_quad* acc, const Packet8bf& a, const Packet8bf& b, int maskX, int maskY)
+{
+  switch(maskX){
+    case 15:
+      switch(maskY){
+        case 0b1111:          
+          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
+          break;
+        case 0b0011:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b11, 0b11);
+          break;
+        case 0b0001:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b1, 0b11);
+          break;
+        case 0b0111:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b111, 0b11);
+          break;
+      }
+      break;
+    case 3:
+      switch(maskY){
+        case 0b1111:
+          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
+          break;
+        case 0b0011:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b11, 0b11);
+          break;
+        case 0b0001:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b1, 0b11);
+          break;
+        case 0b0111:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b111, 0b11);
+          break;
+      }
+      break;
+    case 1:
+      switch(maskY){
+        case 0b1111:
+          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
+          break;
+        case 0b0011:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b11, 0b11);
+          break;
+        case 0b0001:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b1, 0b11);
+          break;
+        case 0b0111:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b111, 0b11);
+          break;
+      }
+      break;
+    case 0b0111:
+      switch(maskY){
+        case 0b1111:
+          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
+          break;
+        case 0b0011:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b11, 0b11);
+          break;
+        case 0b0001:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b1, 0b11);
+          break;
+        case 0b0111:
+          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b111, 0b11);
+          break;
+      }
+      break;
+  }
+}
+
+EIGEN_STRONG_INLINE void scaleAndStore(float* result, float* acc, Packet4f pAlpha)
+{
+  Packet4f result_block = ploadu<Packet4f>(result);
+  Packet4f packet_pmadd = pmadd(pload<Packet4f>(acc), pAlpha, result_block);
+  pstoreu(result, packet_pmadd);
+}
+
+template<int num_packets, bool zero>
+EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
+{
+  Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
+  Packet8bf lhs2;
+  const int packet_size = 8; // We fit 8 bfloat16 values in a 128-bit register
+  if(zero){
+    lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+  }
+  else lhs2 = ploadu<Packet8bf>(indexA + num_packets*packet_size);
+  return vec_mergeh(lhs1.m_val, lhs2.m_val);
+}
+
+template<bool zero>
+EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, int extra_rows)
+{
+  EIGEN_ALIGN16 bfloat16 lhs_array[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)};
+  int count = 0;
+  const bfloat16* idxA = indexA + row*strideA;
+  for(int row_count = 0; row_count < extra_rows; row_count++){
+    lhs_array[count++] = *idxA;
+    if(!zero) lhs_array[count] = *(idxA+1);
+    count++;
+    idxA += strideA;
+  }
+  return pload<Packet8bf>(lhs_array);
+}
+
+template<bool zero>
+EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, int i, int k)
+{
+  const bfloat16* indexB = baseB + strideB*4*i + (k*4);
+  Packet8bf rhs1 = ploadu<Packet8bf>(indexB);
+  if(zero){
+    Packet8bf rhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+    return vec_mergeh(rhs1.m_val, rhs2.m_val);
+  }
+  // r = vec_perm(a, b, c)
+  // Let v be the concatenation of a and b.
+  // Each byte of r is selected by using the least-significant 5 bits of the corresponding byte of c as an index into v.
+  // We need these elements from rhs: 0, 4, 1, 5, 2, 6, 3, 7
+  Packet16uc c = {0x0u, 0x1u, 0x8u, 0x9u, 0x2u, 0x3u, 0xAu, 0xB, 0x4, 0x5, 0xCu, 0xDu, 0x6u, 0x7u, 0xEu, 0xFu};
+  return vec_perm(rhs1.m_val, rhs1.m_val, c);
+}
+
+template<bool zero>
+EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, int i, int k, int extra_cols)
+{
+  EIGEN_ALIGN16 bfloat16 rhs_vector[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)};
+  const bfloat16* indexB = blockB + ((col+4*i)*strideB)+k+offsetB;
+  for(int c = 0; c < extra_cols; c++){
+    rhs_vector[2*c] = *indexB;
+    if(!zero) rhs_vector[2*c+1] = *(indexB+1);
+    indexB += strideB;
+  }
+  return pload<Packet8bf>(rhs_vector);
+}
+
+template<int num_acc, int num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
+EIGEN_STRONG_INLINE void KLoop
+(
+  const bfloat16* indexA,
+  const bfloat16* indexB,
+  __vector_quad (&quad_acc)[num_acc],
+  Index strideA,
+  Index strideB,
+  Index offsetB,
+  Index k,
+  Index row,
+  Index col,
+  int extra_rows,
+  int extra_cols,
+  int mask_rows = 0xF,
+  int mask_cols = 0xF
+)
+{
+  Packet8bf lhs;
+  Packet8bf rhs[num_acc];
+  if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k, strideA, row, extra_rows);
+  else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets*8); //a packet of bfloat16 has 8 elements
+  for(int i = 0; i < num_acc; i++){
+    if(!rhs_extra_cols)
+      rhs[i] = loadRhsBfloat16<zero>(indexB, strideB, i, k);
+    else{
+      rhs[i] = loadRhsBfloat16ExtraCols<zero>(indexB, strideB, offsetB, col, i, k, extra_cols);
+    }
+    pgerMMAbfloat16(&(quad_acc[i]), rhs[i], lhs, mask_cols, mask_rows);
+  }
+}
+
+template<const int num_acc, const int standard_block_size, const int num_packets, bool rhsExtraCols = false, bool lhsExtraRows = false>
+void colLoopBody(Index* p_col, Index row, Index depth, Index cols, Index rows, int offset_row, int block_index, Packet4f pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, int extra_cols = 0, int extra_rows = 0, int mask_cols = 0xF, int mask_rows = 0xF)
+{
+  int col = *p_col;
+  int count;
+  int max, step, bound;
+  const bfloat16* indexB;
+
+  if(num_acc == 1) bound = 0;
+  else bound = 1;
+
+  if(rhsExtraCols){
+    count = 0;
+    max = 1;
+    step = 1;
+    indexB = blockB;
+  }
+  else{
+    count = col;
+    step = num_acc * 4; //each accumulator has 4 elements
+    max = cols/step;
+    indexB = blockB + 4*offsetB + strideB*col;
+  }
+
+  while(count/step + bound < max){
+    Index k = 0;
+    EIGEN_ALIGN32 float acc[num_acc][4][4];
+    __vector_quad quad_acc[num_acc];
+ 
+    for(int i = 0; i < num_acc; i++)
+      __builtin_mma_xxsetaccz(&(quad_acc[i]));
+
+    if(depth%2 != 0){
+      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols);
+      k = 1;
+    }
+    for(; k/2 < depth/2; k += 2){ 
+      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols);
+    }
+    for(int i = 0; i < num_acc; i++){
+      __builtin_mma_disassemble_acc((void*)acc[i], &(quad_acc[i]));
+      if(lhsExtraRows){
+        for(int x = 0; x < extra_cols; x++){
+          for(int y = 0; y < extra_rows; y++){
+            result[((col+i*4)+x)*rows + row + y] += acc[i][x][y]*(pAlpha[0]);
+          }
+        }
+      }
+      else{
+        if(rhsExtraCols){
+          for(int x = 0; x < cols-col; x++){
+            scaleAndStore(result + ((col+i*4)+x)*rows + row + offset_row,acc[i][x], pAlpha);
+          }
+        }
+        else{
+          for(int x = 0; x < 4; x++){
+            scaleAndStore(result + ((col+i*4)+x)*rows + (block_index*16) + offset_row,acc[i][x], pAlpha);
+          }
+        }
+      }
+    }
+    count += step;
+    if(!rhsExtraCols) {
+      indexB += strideB*step;
+      col += step;
+    }
+  }
+  *p_col = col;
+}
+
+template<typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+void gemmMMAbfloat16(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+
+  if(rows == 0 || cols == 0 || depth == 0) return;
+  const Packet4f pAlpha = pset1<Packet4f>(Eigen::bfloat16_impl::bfloat16_to_float(alpha));
+  ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
+
+  for(int j = 0; j < cols; j++){
+    for(int i = 0; i < rows; i++){
+      result[j*rows + i] = res(i,j);
+    }
+  }
+
+  Index row = 0;
+  Index col = 0;
+
+  if( strideA == -1 ) strideA = depth;
+  if( strideB == -1 ) strideB = depth;
+  // Packing is done in blocks.
+  // There are 3 possible block sizes:
+  // Blocks of 8 columns with 16 elements (8x16) as col major
+  // Blocks of 8 columns with 8 elements (8x8) as col major. This happens when 8 < rows < 16
+  // Blocks of 8 columns with <8 elements as row major. This happens when fewer than 8 rows remain
+
+  //Loop for LHS standard block (8x16)
+  int standard_block_size = 16;
+  const int standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
+  int bigSuffix = (2*8) * (strideA-offsetA-depth);
+  const bfloat16* indexA = blockA;
+  int block_index;
+  for(block_index = 0; block_index < standard_blocks_quantity; block_index++){
+    indexA += 2*8*offsetA;
+    for(int offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum
+      col = 0;
+      colLoopBody<5, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<4, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<3, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<2, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<1, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      if(cols > col){
+        int extra_cols= cols-col;
+        int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
+        int mask_cols= 0xF >> shift;
+        // Remember: it doesn't make sense to use multiple accumulators for extra_cols since we are unrolling the column loop
+        colLoopBody<1, 16, 2, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF);
+      }
+    }
+    row += 16;
+    indexA += bigSuffix + 2*8*depth;
+  }
+  //LHS (8x8) block
+  if(rows - standard_blocks_quantity*16 >= 8){
+    indexA += 1*8*offsetA + 2*8*offsetA;
+    for(int offset_row = 0; offset_row < 8; offset_row += 4){
+      col = 0;
+      colLoopBody<5, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<4, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<3, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<2, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<1, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+    }
+    if(cols > col){
+      int extra_cols= cols-col;
+      int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
+      int mask_cols= 0xF >> shift;
+
+      for(int offset_row = 0; offset_row < 8; offset_row += 4){
+        colLoopBody<1, 8, 1, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF);
+      }
+    } //end extra cols
+    row += 8;
+  }
+  //extra rows
+  while(row < rows){
+    int extra_rows = rows-row;
+    int shift = (4-extra_rows >= 0) ? 4-extra_rows : 0;
+    int mask_rows = 0xF >> shift;
+    int extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
+
+    // This index is the beginning of the remaining block.
+    // This last block of the LHS is organized as RowMajor
+    col = 0;
+    colLoopBody<5, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+    colLoopBody<4, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+    colLoopBody<3, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+    colLoopBody<2, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+    colLoopBody<1, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+    if(cols > col){
+      int extra_cols= cols-col;
+      int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
+      int mask_cols= 0xF >> shift;
+      int extra_cols_or_four = (extra_cols <= 4) ? extra_cols : 4;
+
+      colLoopBody<1, 8, 1, true, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols_or_four, extra_rows_or_four, mask_cols, mask_rows);
+    }
+    row += extra_rows_or_four;
+  }
+
+  //Convert back to bfloat16
+  for(col = 0; col/4 < cols/4; col += 4){
+    int row;
+    for(row = 0; row/8 < rows/8; row += 8){
+      //get and save block
+      PacketBlock<Packet8bf,4> block;
+      for(int j = 0; j < 4; j++){
+        Packet4f temp_even, temp_odd;
+        EIGEN_ALIGN32 float even[4], odd[4];
+        for(int i = 0; i < 4; i++){
+          even[i] = result[(col + j)*rows + row + i*2];
+          odd[i] = result[(col + j)*rows + row + i*2+1];
+        }
+        temp_even = pload<Packet4f>(even);
+        temp_odd = pload<Packet4f>(odd);
+        block.packet[j] = F32ToBf16(temp_even, temp_odd);
+      }
+
+      res.template storePacketBlock<Packet8bf,4>(row, col, block);
+    }
+    //extra rows
+    while(row < rows){
+      for(int col_off = 0; col_off < 4; col_off++){
+        res(row, col+col_off) = Eigen::bfloat16(result[(col+col_off)*rows+row]);
+      }
+      row++;
+    }
+
+  }
+  //extra cols
+  while(col < cols){
+    for(int r = 0; r < rows; r++){
+      res(r, col) = Eigen::bfloat16(result[col*rows + r]);
+    }
+    col++;
+  }
+}
+
+
+}
+}
+#endif //EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
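
A plain scalar reference, for illustration only (not the MMA path), of what gemmMMAbfloat16 computes: the update res += alpha * A * B is accumulated in float and converted back to bfloat16 only at the end, as in the conversion loops above.

    // Assumes compatible sizes: res is rows x cols, A is rows x depth, B is depth x cols.
    void gemm_bf16_reference(Eigen::MatrixX<Eigen::bfloat16>& res,
                             const Eigen::MatrixX<Eigen::bfloat16>& A,
                             const Eigen::MatrixX<Eigen::bfloat16>& B, float alpha) {
      for (Eigen::Index j = 0; j < res.cols(); ++j)
        for (Eigen::Index i = 0; i < res.rows(); ++i) {
          float acc = 0.0f;
          for (Eigen::Index k = 0; k < A.cols(); ++k)
            acc += static_cast<float>(A(i, k)) * static_cast<float>(B(k, j));
          res(i, j) = Eigen::bfloat16(static_cast<float>(res(i, j)) + alpha * acc);
        }
    }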
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 37398de..b0f8529 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1460,11 +1460,6 @@
   if (i < n2) {
     *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
   }
-
-  LOAD_STORE_UNROLL_16
-  for (Index i = 0; i < n; i++) {
-    to[i] = from[i];
-  }
 #endif
 }
 
@@ -1575,6 +1570,9 @@
   return pand<Packet8us>(p8us_abs_mask, a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return vec_sra(a.m_val, vec_splat_u16(15)); }
+template<> EIGEN_STRONG_INLINE Packet4f  psignbit(const Packet4f&  a) { return  (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31))); }
+
 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
 { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
@@ -2708,7 +2706,7 @@
     HasAbs  = 1,
     HasSin  = 0,
     HasCos  = 0,
-    HasATan = 1,
+    HasATan = 0,
     HasLog  = 0,
     HasExp  = 1,
     HasSqrt = 1,
@@ -2928,7 +2926,7 @@
   return vec_sld(a, a, 8);
 }
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
-
+template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d&  a) { return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63))); }
 // VSX support varies between different compilers and even different
 // versions of the same compiler.  For gcc version >= 4.9.3, we can use
 // vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 008dd7a..3b1c6e7 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -66,6 +66,7 @@
     HasMul       = 1,
     HasDiv       = 1,
     HasNegate    = 1,
+    HasSqrt      = 1,
     HasAbs       = 0,
     HasAbs2      = 0,
     HasMin       = 0,
@@ -406,6 +407,7 @@
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index e49e394..5dc5ef8 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -2,7 +2,7 @@
 
 namespace Eigen {
 namespace internal {
-  
+
 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
 
 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
@@ -183,7 +183,11 @@
   }
 };
 
-#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+// The register at operand 3 of fmla for data type half must be v0~v15, and the compiler may not
+// allocate a suitable register for the '%2' of the inline asm 'fmla %0.8h, %1.8h, %2.h[id]'.
+// Hence inline assembly cannot be used here to avoid the gcc issue where vfmaq_lane_f16 is
+// implemented through a costly dup.
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
 
 template<>
 struct gebp_traits <half,half,false,false,Architecture::NEON>
@@ -212,9 +216,11 @@
   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
   {}
 
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const
   {
-    loadRhs(b,dest);
+    // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
+    // using a single scalar value.
+    eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
   }
 
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
@@ -240,19 +246,10 @@
   template<int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
-    #if EIGEN_COMP_GNUC_STRICT
-    // 1. vfmaq_lane_f16 is implemented through a costly dup
-    // 2. workaround the gcc register split problem on arm64-neon
-         if(LaneID==0)  asm("fmla %0.8h, %1.8h, %2.h[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==1)  asm("fmla %0.8h, %1.8h, %2.h[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==2)  asm("fmla %0.8h, %1.8h, %2.h[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==3)  asm("fmla %0.8h, %1.8h, %2.h[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    #else
     c = vfmaq_lane_f16(c, a, b, LaneID);
-    #endif
   }
 };
-#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
 #endif // EIGEN_ARCH_ARM64
 
 }  // namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 5cbf4ac..069bd4c 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -176,6 +176,7 @@
     size = 4,
     HasHalfPacket = 1,
 
+    HasCmp       = 1,
     HasAdd       = 1,
     HasSub       = 1,
     HasShift     = 1,
@@ -188,7 +189,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0,
 
     HasDiv   = 1,
@@ -237,7 +238,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0
   };
 };
@@ -267,7 +268,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0,
 
     HasSqrt = 1
@@ -286,6 +287,7 @@
     size = 8,
     HasHalfPacket = 1,
 
+    HasCmp       = 1,
     HasAdd       = 1,
     HasSub       = 1,
     HasShift     = 1,
@@ -298,7 +300,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0
   };
 };
@@ -315,19 +317,20 @@
     size = 8,
     HasHalfPacket = 1,
 
+    HasCmp       = 1,
     HasAdd       = 1,
     HasSub       = 1,
     HasShift     = 1,
     HasMul       = 1,
     HasNegate    = 0,
-    HasAbs       = 0,
+    HasAbs       = 1,
     HasAbsDiff   = 1,
     HasArg       = 0,
     HasAbs2      = 1,
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0,
     HasSqrt = 1
   };
@@ -345,6 +348,7 @@
     size = 4,
     HasHalfPacket = 1,
 
+    HasCmp       = 1,
     HasAdd       = 1,
     HasSub       = 1,
     HasShift     = 1,
@@ -357,7 +361,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0
   };
 };
@@ -374,19 +378,20 @@
     size = 4,
     HasHalfPacket = 1,
 
+    HasCmp       = 1,
     HasAdd       = 1,
     HasSub       = 1,
     HasShift     = 1,
     HasMul       = 1,
     HasNegate    = 0,
-    HasAbs       = 0,
+    HasAbs       = 1,
     HasArg       = 0,
     HasAbs2      = 1,
     HasAbsDiff   = 1,
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0,
 
     HasSqrt = 1
@@ -418,7 +423,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0
   };
 };
@@ -441,14 +446,14 @@
     HasShift     = 1,
     HasMul       = 1,
     HasNegate    = 0,
-    HasAbs       = 0,
+    HasAbs       = 1,
     HasArg       = 0,
     HasAbs2      = 1,
     HasAbsDiff   = 1,
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0
   };
 };
@@ -2372,6 +2377,15 @@
 }
 template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
 
+template <>
+EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
+  return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
 { return pfrexp_generic(a,exponent); }
 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
@@ -3397,7 +3411,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0,
     HasDiv       = 1,
     HasFloor     = 1,
@@ -3754,7 +3768,7 @@
     HasMin       = 1,
     HasMax       = 1,
     HasConj      = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend     = 0,
 
     HasDiv   = 1,
@@ -3904,6 +3918,11 @@
 
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
+}
+
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 { return vaddvq_f64(a); }
 
@@ -3993,7 +4012,7 @@
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
-    HasSetLinear = 0,
+    HasSetLinear = 1,
     HasBlend = 0,
     HasInsert = 1,
     HasReduxp = 1,
@@ -4465,12 +4484,22 @@
   return vabsq_f16(a);
 }
 
+template<>
+EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
+  return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
   return vabs_f16(a);
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
+  return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
+}
+
+template <>
 EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
   float16x4_t a_lo, a_hi, sum;
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 847ff07..a0ff359 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -649,6 +649,17 @@
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); }
+template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a)
+{
+    Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
+#ifdef EIGEN_VECTORIZE_AVX
+    return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
+#else
+    return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
+#endif // EIGEN_VECTORIZE_AVX
+}
+
 #ifdef EIGEN_VECTORIZE_SSE4_1
 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
 {
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index eaa4352..c8bb4e7 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -388,7 +388,11 @@
 struct maybe_raise_div_by_zero<Packet, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) {
     if (EIGEN_PREDICT_FALSE(predux_any(pcmp_eq(x, pzero(x))))) {
-      std::raise(SIGFPE);
+      // Use volatile variables to force a division by zero, which will
+      // result in the default platform behaviour (usually SIGFPE).
+      volatile typename unpacket_traits<Packet>::type zero = 0;
+      volatile typename unpacket_traits<Packet>::type val = 1;
+      val = val / zero;
     }
   }
 };
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index e099d4a..4943d87 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -145,6 +145,39 @@
   const linspaced_op_impl<Scalar,NumTraits<Scalar>::IsInteger> impl;
 };
 
+template <typename Scalar>
+struct equalspaced_op {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC equalspaced_op(const Scalar& start, const Scalar& step) : m_start(start), m_step(step) {}
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+    return m_start + m_step * static_cast<Scalar>(i);
+  }
+  template <typename Packet, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(IndexType i) const {
+    const Packet cst_start = pset1<Packet>(m_start);
+    const Packet cst_step = pset1<Packet>(m_step);
+    const Packet cst_lin0 = plset<Packet>(Scalar(0));
+    const Packet cst_offset = pmadd(cst_lin0, cst_step, cst_start);
+
+    Packet i_packet = pset1<Packet>(static_cast<Scalar>(i));
+    return pmadd(i_packet, cst_step, cst_offset);
+  }
+  const Scalar m_start;
+  const Scalar m_step;
+};
+
+template <typename Scalar>
+struct functor_traits<equalspaced_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
+    PacketAccess =
+        packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasMul && packet_traits<Scalar>::HasAdd,
+    IsRepeatable = true
+  };
+};
+
 // Linear access is automatically determined from the operator() prototypes available for the given functor.
 // If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
 // and linear access is not possible. In all other cases, linear access is enabled.
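
A usage sketch of the new functor through a nullary expression, for illustration only (the EqualSpaced / setEqualSpaced conveniences mentioned in the changelog are assumed to forward to it):

    // v[i] = start + i * step; vectorized when packet_traits reports HasSetLinear, HasMul and HasAdd.
    Eigen::VectorXf v = Eigen::VectorXf::NullaryExpr(
        5, Eigen::internal::equalspaced_op<float>(1.0f, 0.5f));
    // v is [1.0, 1.5, 2.0, 2.5, 3.0]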
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 179d41c..4a6cef5 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -57,8 +57,13 @@
 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
 #elif EIGEN_ARCH_PPC
 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
+#ifdef _ARCH_PWR10
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2*1024*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8*1024*1024);
+#else
 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
+#endif
 #else
 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
@@ -746,6 +751,9 @@
 
 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
   typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+  enum{
+    size = 2 * unpacket_traits<Packet>::size
+  };
 };
 // template<typename Packet>
 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
@@ -2485,7 +2493,13 @@
           // nr (which is currently 4) for the return type.
           const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
           const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
-          if ((SwappedTraits::LhsProgress % 4) == 0 &&
+          // The following code assumes we can load SRhsPacket in such a way that
+          // it multiplies blocks of 4 elements in SLhsPacket.  This is not the
+          // case for some customized kernels (e.g. NEON fp16).  If the assumption
+          // fails, drop down to the scalar path.
+          constexpr bool kCanLoadSRhsQuad = (unpacket_traits<SLhsPacket>::size < 4) || (unpacket_traits<SRhsPacket>::size % (unpacket_traits<SLhsPacket>::size / 4)) == 0;
+          if (kCanLoadSRhsQuad && 
+              (SwappedTraits::LhsProgress % 4) == 0 &&
               (SwappedTraits::LhsProgress<=16) &&
               (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
               (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 3f00077..3f7638e 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -379,11 +379,12 @@
     storePacketBlock_helper<SubPacket, Scalar_, n, idx-1> spbh;
     EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
       spbh.store(sup, i,j,block);
-      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
-      {
-        Scalar_ *v = &sup->operator()(i+l, j+idx);
-        *v = block.packet[idx][l];
-      }
+      sup->template storePacket<SubPacket>(i, j+idx, block.packet[idx]);
+      //for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      //{
+      //  Scalar_ *v = &sup->operator()(i+l, j+idx);
+      //  *v = *reinterpret_cast<Scalar_ *>(&block.packet[idx][l]);
+      //}
     }
   };
 
@@ -393,12 +394,7 @@
     storePacketBlock_helper<SubPacket, std::complex<float>, n, idx-1> spbh;
     EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
       spbh.store(sup,i,j,block);
-      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
-      {
-        std::complex<float> *v = &sup->operator()(i+l, j+idx);
-        v->real(block.packet[idx].v[2*l+0]);
-        v->imag(block.packet[idx].v[2*l+1]);
-      }
+      sup->template storePacket<SubPacket>(i, j+idx, block.packet[idx]);
     }
   };
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 503d651..8f87c4a 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -270,8 +270,10 @@
 template<typename Scalar>     class JacobiRotation;
 
 // Geometry module:
+namespace internal {
+template<typename Derived, typename OtherDerived, int Size = MatrixBase<Derived>::SizeAtCompileTime> struct cross_impl;
+}
 template<typename Derived, int Dim_> class RotationBase;
-template<typename Lhs, typename Rhs> class Cross;
 template<typename Derived> class QuaternionBase;
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index d8a67a4..4cd81a0 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -654,11 +654,11 @@
 // The macro EIGEN_COMP_CXXVER defines the c++ version expected by the compiler.
 // For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
 // is defined to 17.
-#if EIGEN_CPLUSPLUS > 201703L
+#if EIGEN_CPLUSPLUS >= 202002L
   #define EIGEN_COMP_CXXVER 20
-#elif EIGEN_CPLUSPLUS > 201402L
+#elif EIGEN_CPLUSPLUS >= 201703L
   #define EIGEN_COMP_CXXVER 17
-#elif EIGEN_CPLUSPLUS > 201103L
+#elif EIGEN_CPLUSPLUS >= 201402L
   #define EIGEN_COMP_CXXVER 14
 #elif EIGEN_CPLUSPLUS >= 201103L
   #define EIGEN_COMP_CXXVER 11
@@ -717,13 +717,16 @@
 //       and it could be that XCode 9 works just fine.
 // NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
 //       and not tested.
+// NOTE: Intel C++ Compiler Classic (icc) Version 19.0 and later supports dynamic allocation
+//       for over-aligned data, but not in a manner that is compatible with Eigen.
+//       See https://gitlab.com/libeigen/eigen/-/issues/2575
 #ifndef EIGEN_HAS_CXX17_OVERALIGN
 #if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                                 \
            (EIGEN_COMP_MSVC >= 1912)                                                    \
         || (EIGEN_GNUC_AT_LEAST(7,0))                                                   \
         || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500))             \
         || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
-      )
+      ) && !EIGEN_COMP_ICC
 #define EIGEN_HAS_CXX17_OVERALIGN 1
 #else
 #define EIGEN_HAS_CXX17_OVERALIGN 0
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 385f4f4..e4a8793 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -96,19 +96,17 @@
 
 /* ----- Hand made implementations of aligned malloc/free and realloc ----- */
 
-/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
-  * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
+/** \internal Like malloc, but the returned pointer is guaranteed to be aligned to `alignment`.
+  * Fast, but wastes `alignment` additional bytes of memory. Does not throw any exception.
   */
 EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
-  eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2");
-
-  EIGEN_USING_STD(malloc)
-  void *original = malloc(size+alignment);
-  
+  eigen_assert(alignment >= sizeof(void*) && alignment <= 128 && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*), less than or equal to 128, and a power of 2");
+  void* original = std::malloc(size + alignment);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
-  *(reinterpret_cast<void**>(aligned) - 1) = original;
+  uint8_t offset = static_cast<uint8_t>(alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1)));
+  void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
+  *(static_cast<uint8_t*>(aligned) - 1) = offset;
   return aligned;
 }
 
@@ -116,8 +114,9 @@
 EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
 {
   if (ptr) {
-    EIGEN_USING_STD(free)
-    free(*(reinterpret_cast<void**>(ptr) - 1));
+    uint8_t offset = static_cast<uint8_t>(*(static_cast<uint8_t*>(ptr) - 1));
+    void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
+    std::free(original);
   }
 }
 
@@ -126,19 +125,22 @@
   * Since we know that our handmade version is based on std::malloc
   * we can use std::realloc to implement efficient reallocation.
   */
-inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
-  if (ptr == 0) return handmade_aligned_malloc(size);
-  void *original = *(reinterpret_cast<void**>(ptr) - 1);
-  std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES);
+  if (ptr == 0) return handmade_aligned_malloc(new_size, alignment);
+  uint8_t old_offset = *(static_cast<uint8_t*>(ptr) - 1);
+  void* old_original = static_cast<uint8_t*>(ptr) - old_offset;
+  void* original = std::realloc(old_original, new_size + alignment);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
-  void *previous_aligned = static_cast<char *>(original)+previous_offset;
-  if(aligned!=previous_aligned)
-    std::memmove(aligned, previous_aligned, size);
-
-  *(reinterpret_cast<void**>(aligned) - 1) = original;
+  if (original == old_original) return ptr;
+  uint8_t offset = static_cast<uint8_t>(alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1)));
+  void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
+  if (offset != old_offset) {
+    const void* src = static_cast<const void*>(static_cast<uint8_t*>(original) + old_offset);
+    std::size_t count = (std::min)(new_size, old_size);
+    std::memmove(aligned, src, count);
+  }
+  *(static_cast<uint8_t*>(aligned) - 1) = offset;
   return aligned;
 }
 
@@ -214,13 +216,12 @@
   * \brief Reallocates an aligned block of memory.
   * \throws std::bad_alloc on allocation failure
   */
-inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
+EIGEN_DEVICE_FUNC inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
 {
   if (ptr == 0) return aligned_malloc(new_size);
-  EIGEN_UNUSED_VARIABLE(old_size)
-
   void *result;
 #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+  EIGEN_UNUSED_VARIABLE(old_size)
   result = std::realloc(ptr,new_size);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
@@ -273,12 +274,12 @@
   free(ptr);
 }
 
-template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
 {
   return aligned_realloc(ptr, new_size, old_size);
 }
 
-template<> inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
 {
   return std::realloc(ptr, new_size);
 }
@@ -471,7 +472,7 @@
   return result;
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
 {
   if (NumTraits<T>::RequireInitialization) {
     return conditional_aligned_realloc_new<T, Align>(pts, new_size, old_size);
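
A worked example of the offset bookkeeping above, for illustration only: with alignment 64, if std::malloc returns an address whose low bits are 0x28 (40), then offset = 64 - 40 = 24, the aligned pointer is original + 24, and the byte at aligned[-1] stores 24 so that handmade_aligned_free and handmade_aligned_realloc can recover the original pointer. If malloc happens to return an already aligned address, the offset is the full alignment; the offset therefore only fits in a single byte while the alignment stays at or below 128, which is the new bound in the assert.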
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 32152ac..6c6fb71 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -43,6 +43,32 @@
 typedef std::int32_t  int32_t;
 typedef std::uint64_t uint64_t;
 typedef std::int64_t  int64_t;
+
+template <size_t Size>
+struct get_integer_by_size {
+    typedef void signed_type;
+    typedef void unsigned_type;
+};
+template <>
+struct get_integer_by_size<1> {
+    typedef int8_t signed_type;
+    typedef uint8_t unsigned_type;
+};
+template <>
+struct get_integer_by_size<2> {
+    typedef int16_t signed_type;
+    typedef uint16_t unsigned_type;
+};
+template <>
+struct get_integer_by_size<4> {
+    typedef int32_t signed_type;
+    typedef uint32_t unsigned_type;
+};
+template <>
+struct get_integer_by_size<8> {
+    typedef int64_t signed_type;
+    typedef uint64_t unsigned_type;
+};
 }
 }
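
A brief sketch of the intended use of get_integer_by_size, for illustration only (relating it to bit-manipulation helpers such as the new psignbit code is an assumption): it picks an integer type of the same width as a given scalar for bit-level work.

    // In the same namespace as the integer typedefs above:
    typedef get_integer_by_size<sizeof(float)>::unsigned_type  float_bits;   // uint32_t
    typedef get_integer_by_size<sizeof(double)>::signed_type   double_bits;  // int64_t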
 
diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h
index dc6a762..fbf020d 100644
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h
@@ -15,39 +15,83 @@
 
 namespace Eigen { 
 
+namespace internal {
+
+// Vector3 version (default)
+template<typename Derived, typename OtherDerived, int Size>
+struct cross_impl
+{
+  typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+  typedef Matrix<Scalar,MatrixBase<Derived>::RowsAtCompileTime,MatrixBase<Derived>::ColsAtCompileTime> return_type;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  return_type run(const MatrixBase<Derived>& first, const MatrixBase<OtherDerived>& second)
+  {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
+
+    // Note that there is no need for an expression here since the compiler
+    // optimize such a small temporary very well (even within a complex expression)
+    typename internal::nested_eval<Derived,2>::type lhs(first.derived());
+    typename internal::nested_eval<OtherDerived,2>::type rhs(second.derived());
+    return return_type(
+      numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
+      numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
+      numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0))
+    );
+  }
+};
+
+// Vector2 version
+template<typename Derived, typename OtherDerived>
+struct cross_impl<Derived, OtherDerived, 2>
+{
+  typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+  typedef Scalar return_type;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  return_type run(const MatrixBase<Derived>& first, const MatrixBase<OtherDerived>& second)
+  {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,2);
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,2);
+    typename internal::nested_eval<Derived,2>::type lhs(first.derived());
+    typename internal::nested_eval<OtherDerived,2>::type rhs(second.derived());
+    return numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0));
+  }
+};
+
+} // end namespace internal
+
 /** \geometry_module \ingroup Geometry_Module
   *
-  * \returns the cross product of \c *this and \a other
+  * \returns the cross product of \c *this and \a other. This is either a scalar for size-2 vectors or a size-3 vector for size-3 vectors.
   *
-  * Here is a very good explanation of cross-product: http://xkcd.com/199/
+  * This method is implemented for two different cases: between vectors of fixed size 2 and between vectors of fixed size 3.
   * 
-  * With complex numbers, the cross product is implemented as
-  * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} - \mathbf{b} \times \mathbf{c})\f$
+  * For vectors of size 3, the output is simply the traditional cross product.
+  *
+  * For vectors of size 2, the output is a scalar.
+  * Given vectors \f$ v = \begin{bmatrix} v_1 & v_2 \end{bmatrix} \f$ and \f$ w = \begin{bmatrix} w_1 & w_2 \end{bmatrix} \f$,
+  * the result is simply \f$ v\times w = \overline{v_1 w_2 - v_2 w_1} = \text{conj}\left|\begin{smallmatrix} v_1 & w_1 \\ v_2 & w_2 \end{smallmatrix}\right| \f$;
+  * or, to put it differently, it is the third coordinate of the cross product of \f$ \begin{bmatrix} v_1 & v_2 & v_3 \end{bmatrix} \f$ and \f$ \begin{bmatrix} w_1 & w_2 & w_3 \end{bmatrix} \f$.
+  * For real-valued inputs, the result can be interpreted as the signed area of a parallelogram spanned by the two vectors.
+  * 
+  * \note With complex numbers, the cross product is implemented as
+  * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} + \mathbf{b} \times \mathbf{c})\f$
   * 
   * \sa MatrixBase::cross3()
   */
 template<typename Derived>
 template<typename OtherDerived>
-#ifndef EIGEN_PARSED_BY_DOXYGEN
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename internal::cross_impl<Derived, OtherDerived>::return_type
 #else
-typename MatrixBase<Derived>::PlainObject
+inline std::conditional_t<SizeAtCompileTime==2, Scalar, PlainObject>
 #endif
 MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
-
-  // Note that there is no need for an expression here since the compiler
-  // optimize such a small temporary very well (even within a complex expression)
-  typename internal::nested_eval<Derived,2>::type lhs(derived());
-  typename internal::nested_eval<OtherDerived,2>::type rhs(other.derived());
-  return typename cross_product_return_type<OtherDerived>::type(
-    numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
-    numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
-    numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0))
-  );
+  return internal::cross_impl<Derived, OtherDerived>::run(*this, other);
 }
 
 namespace internal {
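
A usage sketch of the size-2 overload documented above, for illustration only:

    Eigen::Vector2f v(2.0f, 0.0f), w(0.0f, 3.0f);
    float signed_area = v.cross(w);   // 2*3 - 0*0 = 6, the signed parallelogram area
    Eigen::Vector3f a(1.0f, 0.0f, 0.0f), b(0.0f, 1.0f, 0.0f);
    Eigen::Vector3f c = a.cross(b);   // size-3 vectors still yield (0, 0, 1)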
diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 079b22d..76195c7 100644
--- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -51,9 +51,9 @@
     x.setZero();
     return true;
   }
-  Scalar rho    = 1;
-  Scalar alpha  = 1;
-  Scalar w      = 1;
+  Scalar rho    (1);
+  Scalar alpha  (1);
+  Scalar w      (1);
   
   VectorType v = VectorType::Zero(n), p = VectorType::Zero(n);
   VectorType y(n),  z(n);
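
The motivation for direct initialization here, sketched for illustration only with a hypothetical custom scalar type that has no implicit conversion from int:

    struct MyScalar {
      explicit MyScalar(int v) : value(v) {}  // conversion from int must be requested explicitly
      double value;
      // arithmetic operators and a NumTraits specialization would be needed for real use
    };
    // MyScalar rho = 1;  // ill-formed: copy-initialization needs an implicit conversion
    MyScalar rho(1);      // OK: direct initialization calls the explicit constructor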
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index 2733fbe..25f4601 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -37,7 +37,7 @@
 
 #include "../InternalHeaderCheck.h"
 
-#if !EIGEN_COMP_LLVM
+#if EIGEN_COMP_GNUC_STRICT
 // These routines requires bit manipulation of the sign, which is not compatible
 // with fastmath.
 #pragma GCC push_options
@@ -358,7 +358,7 @@
 } // namespace internal
 } // namespace Eigen
 
-#if !EIGEN_COMP_LLVM
+#if EIGEN_COMP_GNUC_STRICT
 #pragma GCC pop_options
 #endif
 
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 1b10c0e..a69fdca 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -236,7 +236,6 @@
   }
 
 private:
-  void allocate(Index rows, Index cols, unsigned int computationOptions);
   BDCSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
   void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
   void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
@@ -254,6 +253,7 @@
   void computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, Index firstColW, Index shift);
 
  protected:
+  void allocate(Index rows, Index cols, unsigned int computationOptions);
   MatrixXr m_naiveU, m_naiveV;
   MatrixXr m_computed;
   Index m_nRec;
diff --git a/Eigen/src/SVD/BDCSVD_LAPACKE.h b/Eigen/src/SVD/BDCSVD_LAPACKE.h
new file mode 100644
index 0000000..d4cc173
--- /dev/null
+++ b/Eigen/src/SVD/BDCSVD_LAPACKE.h
@@ -0,0 +1,163 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Melven Roehrig-Zoellner <Melven.Roehrig-Zoellner@DLR.de>
+// Copyright (c) 2011, Intel Corporation. All rights reserved.
+//
+// This file is based on the JacobiSVD_LAPACKE.h originally from Intel -
+// see license notice below:
+/*
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Singular Value Decomposition - SVD (divide and conquer variant)
+ ********************************************************************************
+*/
+#ifndef EIGEN_BDCSVD_LAPACKE_H
+#define EIGEN_BDCSVD_LAPACKE_H
+
+namespace Eigen {
+
+namespace internal {
+
+namespace lapacke_helpers {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+// defining a derived class to allow access to protected members
+template <typename MatrixType_, int Options>
+class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
+  typedef BDCSVD<MatrixType_, Options> SVD;
+  typedef typename SVD::MatrixType MatrixType;
+  typedef typename SVD::Scalar Scalar;
+  typedef typename SVD::RealScalar RealScalar;
+
+public:
+  // construct this by moving from a parent object
+  BDCSVD_LAPACKE(SVD&& svd) : SVD(std::move(svd)) {}
+
+  void compute_impl_lapacke(const MatrixType& matrix, unsigned int computationOptions) {
+
+    SVD::allocate(matrix.rows(), matrix.cols(), computationOptions);
+
+    SVD::m_nonzeroSingularValues = SVD::m_diagSize;
+
+    // prepare arguments to ?gesdd
+    const lapack_int matrix_order = lapack_storage_of(matrix);
+    const char jobz  = (SVD::m_computeFullU || SVD::m_computeFullV) ? 'A' : (SVD::m_computeThinU || SVD::m_computeThinV) ? 'S' : 'N';
+    const lapack_int u_cols = (jobz == 'A') ? to_lapack(SVD::m_rows) : (jobz == 'S') ? to_lapack(SVD::m_diagSize) : 1;
+    const lapack_int vt_rows = (jobz == 'A') ? to_lapack(SVD::m_cols) : (jobz == 'S') ? to_lapack(SVD::m_diagSize) : 1;
+    lapack_int ldu, ldvt;
+    Scalar *u, *vt, dummy;
+    MatrixType localU;
+    if (SVD::computeU() && !(SVD::m_computeThinU && SVD::m_computeFullV) ) {
+      ldu  = to_lapack(SVD::m_matrixU.outerStride());
+      u    = SVD::m_matrixU.data();
+    } else if (SVD::computeV()) {
+      localU.resize(SVD::m_rows, u_cols);
+      ldu  = to_lapack(localU.outerStride());
+      u    = localU.data();
+    } else { ldu=1; u=&dummy; }
+    MatrixType localV;
+    if (SVD::computeU() || SVD::computeV()) {
+      localV.resize(vt_rows, SVD::m_cols);
+      ldvt  = to_lapack(localV.outerStride());
+      vt   = localV.data();
+    } else { ldvt=1; vt=&dummy; }
+    MatrixType temp; temp = matrix;
+
+    // actual call to ?gesdd
+    lapack_int info = gesdd( matrix_order, jobz, to_lapack(SVD::m_rows), to_lapack(SVD::m_cols),
+                             to_lapack(temp.data()), to_lapack(temp.outerStride()), (RealScalar*)SVD::m_singularValues.data(),
+                             to_lapack(u), ldu, to_lapack(vt), ldvt);
+
+    // Check the result of the LAPACK call
+    if (info < 0 || !SVD::m_singularValues.allFinite()) {
+      // this includes info == -4 => NaN entry in A
+      SVD::m_info = InvalidInput;
+    } else if (info > 0 ) {
+      SVD::m_info = NoConvergence;
+    } else {
+      SVD::m_info = Success;
+      if (SVD::m_computeThinU && SVD::m_computeFullV) {
+        SVD::m_matrixU = localU.leftCols(SVD::m_matrixU.cols());
+      }
+      if (SVD::computeV()) {
+        SVD::m_matrixV = localV.adjoint().leftCols(SVD::m_matrixV.cols());
+      }
+    }
+    SVD::m_isInitialized = true;
+  }
+};
+
+template<typename MatrixType_, int Options>
+BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixType_& matrix, int computationOptions)
+{
+  // we need to move to the wrapper type and back
+  BDCSVD_LAPACKE<MatrixType_, Options> tmpSvd(std::move(svd));
+  tmpSvd.compute_impl_lapacke(matrix, computationOptions);
+  svd = std::move(tmpSvd);
+  return svd;
+}
+
+} // end namespace lapacke_helpers
+
+} // end namespace internal
+
+#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS) \
+template<> inline \
+BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
+BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) {\
+  return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions); \
+}
+
+#define EIGEN_LAPACK_SDD_OPTIONS(OPTIONS) \
+  EIGEN_LAPACKE_SDD(double,   ColMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(float,    ColMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(dcomplex, ColMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(scomplex, ColMajor, OPTIONS) \
+\
+  EIGEN_LAPACKE_SDD(double,   RowMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(float,    RowMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(dcomplex, RowMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(scomplex, RowMajor, OPTIONS)
+
+EIGEN_LAPACK_SDD_OPTIONS(0)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU | ComputeThinV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU | ComputeFullV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU | ComputeFullV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU | ComputeThinV)
+
+#undef EIGEN_LAPACK_SDD_OPTIONS
+
+#undef EIGEN_LAPACKE_SDD
+
+} // end namespace Eigen
+
+#endif // EIGEN_BDCSVD_LAPACKE_H
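
A minimal usage sketch, mirroring the UsingBlasLapackBackends.dox entry added below: with LAPACKE available and EIGEN_USE_LAPACKE defined, the specializations above route BDCSVD::compute() to ?gesdd.

    #define EIGEN_USE_LAPACKE
    #include <Eigen/Dense>

    Eigen::MatrixXd m1 = Eigen::MatrixXd::Random(100, 60);
    Eigen::BDCSVD<Eigen::MatrixXd> svd;
    svd.compute(m1);   // dispatched to LAPACKE_dgesdd through the wrapper above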
diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
index c5717e9..93244cd 100644
--- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -72,8 +72,16 @@
   } else { ldvt=1; vt=&dummy; }\
   Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
   MatrixType m_temp; m_temp = matrix; \
-  LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(m_rows), internal::convert_index<lapack_int>(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
-  if (computeV()) m_matrixV = localV.adjoint(); \
+  lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(m_rows), internal::convert_index<lapack_int>(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
+  /* Check the result of the LAPACK call */ \
+  if (info < 0 || !m_singularValues.allFinite()) { \
+    m_info = InvalidInput; \
+  } else if (info > 0 ) { \
+    m_info = NoConvergence; \
+  } else { \
+    m_info = Success; \
+    if (computeV()) m_matrixV = localV.adjoint(); \
+  } \
  /* for(int i=0;i<m_diagSize;i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
   m_isInitialized = true; \
   return *this; \
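
With the LAPACK return code now propagated into m_info, callers can query the usual decomposition status; a short sketch (info() is the standard SVDBase accessor, assumed available here):

    Eigen::MatrixXf m = Eigen::MatrixXf::Random(20, 10);
    Eigen::JacobiSVD<Eigen::MatrixXf> svd(m, Eigen::ComputeThinU | Eigen::ComputeThinV);
    if (svd.info() != Eigen::Success) {
      // Eigen::InvalidInput (e.g. NaN entries) or Eigen::NoConvergence
    }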
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index ede5766..243cd16 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -22,6 +22,9 @@
 struct traits<SparseCompressedBase<Derived> > : traits<Derived>
 {};
 
+template <typename Derived, class Comp, bool IsVector>
+struct inner_sort_impl;
+
 } // end namespace internal
 
 /** \ingroup SparseCore_Module
@@ -126,6 +129,40 @@
       *
       * \sa valuePtr(), isCompressed() */
     Map<Array<Scalar,Dynamic,1> > coeffs() { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+    
+    /** sorts the inner vectors in the range [begin,end) with respect to `Comp`  
+      * \sa innerIndicesAreSorted() */
+    template <class Comp = std::less<>>
+    inline void sortInnerIndices(Index begin, Index end) {
+      eigen_assert(begin >= 0 && end <= derived().outerSize() && end >= begin);
+      internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::run(*this, begin, end);
+    }
+    
+    /** \returns the index of the first inner vector in the range [begin,end) that is not sorted with respect to `Comp`, or `end` if the range is fully sorted
+      * \sa sortInnerIndices() */
+    template <class Comp = std::less<>>
+    inline Index innerIndicesAreSorted(Index begin, Index end) const {
+      eigen_assert(begin >= 0 && end <= derived().outerSize() && end >= begin);
+      return internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::check(*this, begin, end);
+    }
+
+    /** sorts the inner vectors in the range [0,outerSize) with respect to `Comp`
+      * \sa innerIndicesAreSorted() */
+    template <class Comp = std::less<>>
+    inline void sortInnerIndices() {
+      Index begin = 0;
+      Index end = derived().outerSize();
+      internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::run(*this, begin, end);
+    }
+
+    /** \returns the index of the first inner vector in the range [0,outerSize) that is not sorted with respect to `Comp`, or `outerSize` if the range is fully sorted
+      * \sa sortInnerIndices() */
+    template<class Comp = std::less<>>
+    inline Index innerIndicesAreSorted() const {
+      Index begin = 0;
+      Index end = derived().outerSize();
+      return internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::check(*this, begin, end);
+    }
 
   protected:
     /** Default constructor. Do nothing. */
@@ -306,6 +343,138 @@
 
 namespace internal {
 
+// modified from https://artificial-mind.net/blog/2020/11/28/std-sort-multiple-ranges
+template <typename Scalar, typename StorageIndex>
+class CompressedStorageIterator;
+
+// wrapper class analogous to std::pair<StorageIndex&, Scalar&>
+// used to define assignment, swap, and comparison operators for CompressedStorageIterator
+template <typename Scalar, typename StorageIndex>
+class StorageRef 
+{
+public:
+  using value_type = std::pair<StorageIndex, Scalar>;
+
+  inline StorageRef& operator=(const StorageRef& other) {
+    *m_innerIndexIterator = *other.m_innerIndexIterator;
+    *m_valueIterator = *other.m_valueIterator;
+    return *this;
+  }
+  inline StorageRef& operator=(const value_type& other) {
+    std::tie(*m_innerIndexIterator, *m_valueIterator) = other;
+    return *this;
+  }
+  inline operator value_type() const { return std::make_pair(*m_innerIndexIterator, *m_valueIterator); }
+  inline friend void swap(const StorageRef& a, const StorageRef& b) {
+    std::iter_swap(a.m_innerIndexIterator, b.m_innerIndexIterator);
+    std::iter_swap(a.m_valueIterator, b.m_valueIterator);
+  }
+
+  inline static const StorageIndex& key(const StorageRef& a) { return *a.m_innerIndexIterator; }
+  inline static const StorageIndex& key(const value_type& a) { return a.first; }
+  #define REF_COMP_REF(OP) inline friend bool operator OP(const StorageRef& a, const StorageRef& b) { return key(a) OP key(b); };
+  #define REF_COMP_VAL(OP) inline friend bool operator OP(const StorageRef& a, const value_type& b) { return key(a) OP key(b); };
+  #define VAL_COMP_REF(OP) inline friend bool operator OP(const value_type& a, const StorageRef& b) { return key(a) OP key(b); };
+  #define MAKE_COMPS(OP) REF_COMP_REF(OP) REF_COMP_VAL(OP) VAL_COMP_REF(OP)
+  MAKE_COMPS(<) MAKE_COMPS(>) MAKE_COMPS(<=) MAKE_COMPS(>=) MAKE_COMPS(==) MAKE_COMPS(!=)
+
+protected:
+  StorageIndex* m_innerIndexIterator;
+  Scalar* m_valueIterator;
+private:
+  StorageRef() = delete;
+  // these constructors are only called by the CompressedStorageIterator constructors for convenience only
+  StorageRef(StorageIndex* innerIndexIterator, Scalar* valueIterator) : m_innerIndexIterator(innerIndexIterator), m_valueIterator(valueIterator) {}
+  StorageRef(const StorageRef& other) : m_innerIndexIterator(other.m_innerIndexIterator), m_valueIterator(other.m_valueIterator) {}
+
+  friend class CompressedStorageIterator<Scalar, StorageIndex>;
+};
+
+// STL-compatible iterator class that operates on inner indices and values
+template<typename Scalar, typename StorageIndex>
+class CompressedStorageIterator
+{
+public:
+  using iterator_category = std::random_access_iterator_tag;
+  using reference = StorageRef<Scalar, StorageIndex>;
+  using difference_type = Index;
+  using value_type = typename reference::value_type;
+  using pointer = value_type*;
+
+  CompressedStorageIterator() = delete;
+  CompressedStorageIterator(difference_type index, StorageIndex* innerIndexPtr, Scalar* valuePtr) : m_index(index), m_data(innerIndexPtr, valuePtr) {}
+  CompressedStorageIterator(difference_type index, reference data) : m_index(index), m_data(data) {}
+  CompressedStorageIterator(const CompressedStorageIterator& other) : m_index(other.m_index), m_data(other.m_data) {}
+  inline CompressedStorageIterator& operator=(const CompressedStorageIterator& other) {
+    m_index = other.m_index;
+    m_data = other.m_data;
+    return *this;
+  }
+
+  inline CompressedStorageIterator operator+(difference_type offset) const { return CompressedStorageIterator(m_index + offset, m_data); }
+  inline CompressedStorageIterator operator-(difference_type offset) const { return CompressedStorageIterator(m_index - offset, m_data); }
+  inline difference_type operator-(const CompressedStorageIterator& other) const { return m_index - other.m_index; }
+  inline CompressedStorageIterator& operator++() { ++m_index; return *this; }
+  inline CompressedStorageIterator& operator--() { --m_index; return *this; }
+  inline CompressedStorageIterator& operator+=(difference_type offset) { m_index += offset; return *this; }
+  inline CompressedStorageIterator& operator-=(difference_type offset) { m_index -= offset; return *this; }
+  inline reference operator*() const { return reference(m_data.m_innerIndexIterator + m_index, m_data.m_valueIterator + m_index); }
+
+  #define MAKE_COMP(OP) inline bool operator OP(const CompressedStorageIterator& other) const { return m_index OP other.m_index; }
+  MAKE_COMP(<) MAKE_COMP(>) MAKE_COMP(>=) MAKE_COMP(<=) MAKE_COMP(!=) MAKE_COMP(==)
+
+protected:
+  difference_type m_index;
+  reference m_data;
+};
+
+template <typename Derived, class Comp, bool IsVector>
+struct inner_sort_impl {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::StorageIndex StorageIndex;
+  static inline void run(SparseCompressedBase<Derived>& obj, Index begin, Index end) {
+    const bool is_compressed = obj.isCompressed();
+    for (Index outer = begin; outer < end; outer++) {
+      Index begin_offset = obj.outerIndexPtr()[outer];
+      Index end_offset = is_compressed ? obj.outerIndexPtr()[outer + 1] : (begin_offset + obj.innerNonZeroPtr()[outer]);
+      CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
+      CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
+      std::sort(begin_it, end_it, Comp());
+    }
+  }
+  static inline Index check(const SparseCompressedBase<Derived>& obj, Index begin, Index end) {
+    const bool is_compressed = obj.isCompressed();
+    for (Index outer = begin; outer < end; outer++) {
+      Index begin_offset = obj.outerIndexPtr()[outer];
+      Index end_offset = is_compressed ? obj.outerIndexPtr()[outer + 1] : (begin_offset + obj.innerNonZeroPtr()[outer]);
+      const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
+      const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
+      bool is_sorted = std::is_sorted(begin_it, end_it, Comp());
+      if (!is_sorted) return outer;
+    }
+    return end;
+  }
+};
+template <typename Derived, class Comp>
+struct inner_sort_impl<Derived, Comp, true> {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::StorageIndex StorageIndex;
+  static inline void run(SparseCompressedBase<Derived>& obj, Index, Index) {
+    Index begin_offset = 0;
+    Index end_offset = obj.nonZeros();
+    CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
+    CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
+    std::sort(begin_it, end_it, Comp());
+  }
+  static inline Index check(const SparseCompressedBase<Derived>& obj, Index, Index) {
+    Index begin_offset = 0;
+    Index end_offset = obj.nonZeros();
+    const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
+    const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
+    return std::is_sorted(begin_it, end_it, Comp()) ? 1 : 0;
+  }
+};
+
 template<typename Derived>
 struct evaluator<SparseCompressedBase<Derived> >
   : evaluator_base<Derived>
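
A short sketch of the new public API added above (the comparator defaults to std::less<>, i.e. ascending inner indices):

    Eigen::SparseMatrix<double> A(10, 10);
    A.setIdentity();
    A.sortInnerIndices<std::greater<>>();            // sort every inner vector in descending order
    Eigen::Index first = A.innerIndicesAreSorted();  // first outer index not ascending, or outerSize()
    A.sortInnerIndices();                            // restore the ascending order Eigen expects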
diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h
index a74baec..0ee3813 100644
--- a/Eigen/src/SparseCore/SparseMap.h
+++ b/Eigen/src/SparseCore/SparseMap.h
@@ -239,6 +239,7 @@
     /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients,
       * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
       * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
+      * The inner indices must be sorted appropriately.
       *
       * This constructor is available only if \c SparseMatrixType is non-const.
       *
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index fdac443..6806812 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -694,6 +694,11 @@
       Base::operator=(other);
     }
 
+    inline SparseMatrix(SparseMatrix&& other) : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      *this = other.derived().markAsRValue();
+    }
+
     /** Copy constructor (it performs a deep copy) */
     inline SparseMatrix(const SparseMatrix& other)
       : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
@@ -742,6 +747,7 @@
       internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
       m_innerNonZeros = 0;
     }
+
     inline SparseMatrix& operator=(const SparseMatrix& other)
     {
       if (other.isRValue())
@@ -767,6 +773,10 @@
       return *this;
     }
 
+    inline SparseMatrix& operator=(SparseMatrix&& other) {
+      return *this = other.derived().markAsRValue();
+    }
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
     inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
@@ -1496,6 +1506,126 @@
 
 }
 
+// Specialization for SparseMatrix.
+// Serializes [rows, cols, isCompressed, outerSize, innerBufferSize,
+// innerNonZeros, outerIndices, innerIndices, values].
+template <typename Scalar, int Options, typename StorageIndex>
+class Serializer<SparseMatrix<Scalar, Options, StorageIndex>, void> {
+ public:
+  typedef SparseMatrix<Scalar, Options, StorageIndex> SparseMat;
+
+  struct Header {
+    typename SparseMat::Index rows;
+    typename SparseMat::Index cols;
+    bool compressed;
+    Index outer_size;
+    Index inner_buffer_size;
+  };
+
+  EIGEN_DEVICE_FUNC size_t size(const SparseMat& value) const {
+    // innerNonZeros.
+    std::size_t num_storage_indices = value.isCompressed() ? 0 : value.outerSize();
+    // Outer indices.
+    num_storage_indices += value.outerSize() + 1;
+    // Inner indices.
+    const StorageIndex inner_buffer_size = value.outerIndexPtr()[value.outerSize()];
+    num_storage_indices += inner_buffer_size;
+    // Values.
+    std::size_t num_values = inner_buffer_size;
+    return sizeof(Header) + sizeof(Scalar) * num_values +
+           sizeof(StorageIndex) * num_storage_indices;
+  }
+
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end,
+                                       const SparseMat& value) {
+    if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header = {value.rows(), value.cols(), value.isCompressed(),
+                     value.outerSize(), value.outerIndexPtr()[value.outerSize()]};
+    EIGEN_USING_STD(memcpy)
+    memcpy(dest, &header, header_bytes);
+    dest += header_bytes;
+
+    // innerNonZeros.
+    if (!header.compressed) {
+      std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size;
+      memcpy(dest, value.innerNonZeroPtr(), data_bytes);
+      dest += data_bytes;
+    }
+
+    // Outer indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1);
+    memcpy(dest, value.outerIndexPtr(), data_bytes);
+    dest += data_bytes;
+
+    // Inner indices.
+    data_bytes = sizeof(StorageIndex) * header.inner_buffer_size;
+    memcpy(dest, value.innerIndexPtr(), data_bytes);
+    dest += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.inner_buffer_size;
+    memcpy(dest, value.valuePtr(), data_bytes);
+    dest += data_bytes;
+
+    return dest;
+  }
+
+  EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src,
+                                               const uint8_t* end,
+                                               SparseMat& value) const {
+    if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header;
+    EIGEN_USING_STD(memcpy)
+    memcpy(&header, src, header_bytes);
+    src += header_bytes;
+
+    value.setZero();
+    value.resize(header.rows, header.cols);
+    if (header.compressed) {
+      value.makeCompressed();
+    } else {
+      value.uncompress();
+    }
+    
+    // Adjust value ptr size.
+    value.data().resize(header.inner_buffer_size);
+
+    // Initialize compressed state and inner non-zeros.
+    if (!header.compressed) {           
+      // Inner non-zero counts.
+      std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size;
+      if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+      memcpy(value.innerNonZeroPtr(), src, data_bytes);
+      src += data_bytes;
+    }
+
+    // Outer indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1);
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.outerIndexPtr(), src, data_bytes);
+    src += data_bytes;
+
+    // Inner indices.
+    data_bytes = sizeof(StorageIndex) * header.inner_buffer_size;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.innerIndexPtr(), src, data_bytes);
+    src += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.inner_buffer_size;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.valuePtr(), src, data_bytes);
+    src += data_bytes;
+    return src;
+  }
+};
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEMATRIX_H
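
A hedged round-trip sketch for the specialization above, assuming Eigen's Serializer utility (the three-argument buffer API used in this file) is available:

    Eigen::SparseMatrix<float> src(8, 8), dst;
    src.setIdentity();
    Eigen::Serializer<Eigen::SparseMatrix<float>> ser;
    std::vector<std::uint8_t> buffer(ser.size(src));
    std::uint8_t* out = ser.serialize(buffer.data(), buffer.data() + buffer.size(), src);
    const std::uint8_t* in = ser.deserialize(buffer.data(), buffer.data() + buffer.size(), dst);
    // out/in point past the last byte written/read on success, and are nullptr if the buffer is too small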
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index ebbadb5..3b4d7b0 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -495,6 +495,78 @@
 
 }
 
+// Specialization for SparseVector.
+// Serializes [size, numNonZeros, innerIndices, values].
+template <typename Scalar, int Options, typename StorageIndex>
+class Serializer<SparseVector<Scalar, Options, StorageIndex>, void> {
+ public:
+  typedef SparseVector<Scalar, Options, StorageIndex> SparseMat;
+
+  struct Header {
+    typename SparseMat::Index size;
+    Index num_non_zeros;
+  };
+
+  EIGEN_DEVICE_FUNC size_t size(const SparseMat& value) const {
+    return sizeof(Header) +
+           (sizeof(Scalar) + sizeof(StorageIndex)) * value.nonZeros();
+  }
+
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end,
+                                       const SparseMat& value) {
+    if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header = {value.innerSize(), value.nonZeros()};
+    EIGEN_USING_STD(memcpy)
+    memcpy(dest, &header, header_bytes);
+    dest += header_bytes;
+
+    // Inner indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros;
+    memcpy(dest, value.innerIndexPtr(), data_bytes);
+    dest += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.num_non_zeros;
+    memcpy(dest, value.valuePtr(), data_bytes);
+    dest += data_bytes;
+
+    return dest;
+  }
+
+  EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src,
+                                               const uint8_t* end,
+                                               SparseMat& value) const {
+    if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header;
+    EIGEN_USING_STD(memcpy)
+    memcpy(&header, src, header_bytes);
+    src += header_bytes;
+
+    value.setZero();
+    value.resize(header.size);
+    value.resizeNonZeros(header.num_non_zeros);
+
+    // Inner indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.innerIndexPtr(), src, data_bytes);
+    src += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.num_non_zeros;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.valuePtr(), src, data_bytes);
+    src += data_bytes;
+    return src;
+  }
+};
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEVECTOR_H
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index 9fd7e25..f70aab1 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -37,9 +37,10 @@
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
 
-  SparseLUTransposeView() : m_sparseLU(NULL) {}
-  SparseLUTransposeView(const SparseLUTransposeView& view) {
+  SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {}
+  SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() {
     this->m_sparseLU = view.m_sparseLU;
+    this->m_isInitialized = view.m_isInitialized;
   }
   void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;}
   void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;}
@@ -860,7 +861,6 @@
   template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
   {
     Index nrhs = X.cols();
-    Index n    = X.rows();
     // Backward solve with U
     for (Index k = m_mapL.nsuper(); k >= 0; k--)
     {
@@ -880,7 +880,7 @@
       {
         // FIXME: the following lines should use Block expressions and not Map!
         Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
         U = A.template triangularView<Upper>().solve(U);
       }
 
@@ -903,7 +903,6 @@
   {
     using numext::conj;
     Index nrhs = X.cols();
-    Index n    = X.rows();
     // Forward solve with U
     for (Index k = 0; k <=  m_mapL.nsuper(); k++)
     {
@@ -934,7 +933,7 @@
       else
       {
         Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
         if(Conjugate)
           U = A.adjoint().template triangularView<Lower>().solve(U);
         else
diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 0d59a38..adfc63a 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -276,9 +276,8 @@
         
         // Triangular solve 
         Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
-        U = A.template triangularView<UnitLower>().solve(U); 
-        
+        typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
+        U = A.template triangularView<UnitLower>().solve(U);        
         // Matrix-vector product 
         new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
         work.topRows(nrow).noalias() = A * U;
@@ -351,7 +350,7 @@
 
       // Matrix-vector product with transposed submatrix
       Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
-      Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+      typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
       if(Conjugate)
         U = U - A.adjoint() * work.topRows(nrow);
       else
diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
deleted file mode 100644
index 034d379..0000000
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ /dev/null
@@ -1,282 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H
-#define EIGEN_SPARSELU_GEMM_KERNEL_H
-
-#include "./InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-
-/** \internal
-  * A general matrix-matrix product kernel optimized for the SparseLU factorization.
-  *  - A, B, and C must be column major
-  *  - lda and ldc must be multiples of the respective packet size
-  *  - C must have the same alignment as A
-  */
-template<typename Scalar>
-EIGEN_DONT_INLINE
-void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
-{
-  using namespace Eigen::internal;
-  
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum {
-    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-    PacketSize = packet_traits<Scalar>::size,
-    PM = 8,                             // peeling in M
-    RN = 2,                             // register blocking
-    RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking
-    BM = 4096/sizeof(Scalar),           // number of rows of A-C per chunk
-    SM = PM*PacketSize                  // step along M
-  };
-  Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
-  Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
-  Index i0 = internal::first_default_aligned(A,m);
-  
-  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
-  
-  // handle the non aligned rows of A and C without any optimization:
-  for(Index i=0; i<i0; ++i)
-  {
-    for(Index j=0; j<n; ++j)
-    {
-      Scalar c = C[i+j*ldc];
-      for(Index k=0; k<d; ++k)
-        c += B[k+j*ldb] * A[i+k*lda];
-      C[i+j*ldc] = c;
-    }
-  }
-  // process the remaining rows per chunk of BM rows
-  for(Index ib=i0; ib<m; ib+=BM)
-  {
-    Index actual_b = std::min<Index>(BM, m-ib);                 // actual number of rows
-    Index actual_b_end1 = (actual_b/SM)*SM;                   // actual number of rows suitable for peeling
-    Index actual_b_end2 = (actual_b/PacketSize)*PacketSize;   // actual number of rows suitable for vectorization
-    
-    // Let's process two columns of B-C at once
-    for(Index j=0; j<n_end; j+=RN)
-    {
-      const Scalar* Bc0 = B+(j+0)*ldb;
-      const Scalar* Bc1 = B+(j+1)*ldb;
-      
-      for(Index k=0; k<d_end; k+=RK)
-      {
-        
-        // load and expand a RN x RK block of B
-        Packet b00, b10, b20, b30, b01, b11, b21, b31;
-                  { b00 = pset1<Packet>(Bc0[0]); }
-                  { b10 = pset1<Packet>(Bc0[1]); }
-        if(RK==4) { b20 = pset1<Packet>(Bc0[2]); }
-        if(RK==4) { b30 = pset1<Packet>(Bc0[3]); }
-                  { b01 = pset1<Packet>(Bc1[0]); }
-                  { b11 = pset1<Packet>(Bc1[1]); }
-        if(RK==4) { b21 = pset1<Packet>(Bc1[2]); }
-        if(RK==4) { b31 = pset1<Packet>(Bc1[3]); }
-        
-        Packet a0, a1, a2, a3, c0, c1, t0, t1;
-        
-        const Scalar* A0 = A+ib+(k+0)*lda;
-        const Scalar* A1 = A+ib+(k+1)*lda;
-        const Scalar* A2 = A+ib+(k+2)*lda;
-        const Scalar* A3 = A+ib+(k+3)*lda;
-        
-        Scalar* C0 = C+ib+(j+0)*ldc;
-        Scalar* C1 = C+ib+(j+1)*ldc;
-        
-                  a0 = pload<Packet>(A0);
-                  a1 = pload<Packet>(A1);
-        if(RK==4)
-        {
-          a2 = pload<Packet>(A2);
-          a3 = pload<Packet>(A3);
-        }
-        else
-        {
-          // workaround "may be used uninitialized in this function" warning
-          a2 = a3 = a0;
-        }
-        
-#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
-#define WORK(I)  \
-                     c0 = pload<Packet>(C0+i+(I)*PacketSize);    \
-                     c1 = pload<Packet>(C1+i+(I)*PacketSize);    \
-                     KMADD(c0, a0, b00, t0)                      \
-                     KMADD(c1, a0, b01, t1)                      \
-                     a0 = pload<Packet>(A0+i+(I+1)*PacketSize);  \
-                     KMADD(c0, a1, b10, t0)                      \
-                     KMADD(c1, a1, b11, t1)                      \
-                     a1 = pload<Packet>(A1+i+(I+1)*PacketSize);  \
-          if(RK==4){ KMADD(c0, a2, b20, t0)                     }\
-          if(RK==4){ KMADD(c1, a2, b21, t1)                     }\
-          if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\
-          if(RK==4){ KMADD(c0, a3, b30, t0)                     }\
-          if(RK==4){ KMADD(c1, a3, b31, t1)                     }\
-          if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
-                     pstore(C0+i+(I)*PacketSize, c0);            \
-                     pstore(C1+i+(I)*PacketSize, c1)
-        
-        // process rows of A' - C' with aggressive vectorization and peeling 
-        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
-        {
-          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL1");
-                    prefetch((A0+i+(5)*PacketSize));
-                    prefetch((A1+i+(5)*PacketSize));
-          if(RK==4) prefetch((A2+i+(5)*PacketSize));
-          if(RK==4) prefetch((A3+i+(5)*PacketSize));
-
-          WORK(0);
-          WORK(1);
-          WORK(2);
-          WORK(3);
-          WORK(4);
-          WORK(5);
-          WORK(6);
-          WORK(7);
-        }
-        // process the remaining rows with vectorization only
-        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
-        {
-          WORK(0);
-        }
-#undef WORK
-        // process the remaining rows without vectorization
-        for(Index i=actual_b_end2; i<actual_b; ++i)
-        {
-          if(RK==4)
-          {
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
-            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
-          }
-          else
-          {
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
-            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
-          }
-        }
-        
-        Bc0 += RK;
-        Bc1 += RK;
-      } // peeled loop on k
-    } // peeled loop on the columns j
-    // process the last column (we now perform a matrix-vector product)
-    if((n-n_end)>0)
-    {
-      const Scalar* Bc0 = B+(n-1)*ldb;
-      
-      for(Index k=0; k<d_end; k+=RK)
-      {
-        
-        // load and expand a 1 x RK block of B
-        Packet b00, b10, b20, b30;
-                  b00 = pset1<Packet>(Bc0[0]);
-                  b10 = pset1<Packet>(Bc0[1]);
-        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-        
-        Packet a0, a1, a2, a3, c0, t0/*, t1*/;
-        
-        const Scalar* A0 = A+ib+(k+0)*lda;
-        const Scalar* A1 = A+ib+(k+1)*lda;
-        const Scalar* A2 = A+ib+(k+2)*lda;
-        const Scalar* A3 = A+ib+(k+3)*lda;
-        
-        Scalar* C0 = C+ib+(n_end)*ldc;
-        
-                  a0 = pload<Packet>(A0);
-                  a1 = pload<Packet>(A1);
-        if(RK==4)
-        {
-          a2 = pload<Packet>(A2);
-          a3 = pload<Packet>(A3);
-        }
-        else
-        {
-          // workaround "may be used uninitialized in this function" warning
-          a2 = a3 = a0;
-        }
-        
-#define WORK(I) \
-                   c0 = pload<Packet>(C0+i+(I)*PacketSize);     \
-                   KMADD(c0, a0, b00, t0)                       \
-                   a0 = pload<Packet>(A0+i+(I+1)*PacketSize);   \
-                   KMADD(c0, a1, b10, t0)                       \
-                   a1 = pload<Packet>(A1+i+(I+1)*PacketSize);   \
-        if(RK==4){ KMADD(c0, a2, b20, t0)                      }\
-        if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize);  }\
-        if(RK==4){ KMADD(c0, a3, b30, t0)                      }\
-        if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\
-                   pstore(C0+i+(I)*PacketSize, c0);
-        
-        // aggressive vectorization and peeling
-        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
-        {
-          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
-          WORK(0);
-          WORK(1);
-          WORK(2);
-          WORK(3);
-          WORK(4);
-          WORK(5);
-          WORK(6);
-          WORK(7);
-        }
-        // vectorization only
-        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
-        {
-          WORK(0);
-        }
-        // remaining scalars
-        for(Index i=actual_b_end2; i<actual_b; ++i)
-        {
-          if(RK==4) 
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
-          else
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
-        }
-        
-        Bc0 += RK;
-#undef WORK
-      }
-    }
-    
-    // process the last columns of A, corresponding to the last rows of B
-    Index rd = d-d_end;
-    if(rd>0)
-    {
-      for(Index j=0; j<n; ++j)
-      {
-        enum {
-          Alignment = PacketSize>1 ? Aligned : 0
-        };
-        typedef Map<Matrix<Scalar,Dynamic,1>, Alignment > MapVector;
-        typedef Map<const Matrix<Scalar,Dynamic,1>, Alignment > ConstMapVector;
-        if(rd==1)       MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
-        
-        else if(rd==2)  MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
-                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
-        
-        else            MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
-                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
-                                                        + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
-      }
-    }
-  
-  } // blocking on the rows of A and C
-}
-#undef KMADD
-
-} // namespace internal
-
-} // namespace Eigen
-
-#endif // EIGEN_SPARSELU_GEMM_KERNEL_H
diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
index 424f93c..831b938 100644
--- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@@ -71,8 +71,7 @@
   Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
   Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
   
-  l.setZero();
-  internal::sparselu_gemm<Scalar>(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride());
+  l.noalias() = B * u;
   
   // Scatter tempv[] into SPA dense[] as a temporary storage 
   isub = lptr + no_zeros;
diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index 8cd331a..fd1243d 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -150,8 +150,7 @@
       Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
       MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
       
-      L.setZero();
-      internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());
+      L.noalias() = B * U;
       
       // scatter U and L
       u_col = 0;
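
Both bmod kernels now rely on Eigen's built-in GEMM instead of the removed sparselu_gemm kernel; noalias() assigns the product directly, skipping the temporary a plain assignment of a matrix product would allocate, e.g.:

    Eigen::MatrixXd B = Eigen::MatrixXd::Random(16, 8);
    Eigen::MatrixXd U = Eigen::MatrixXd::Random(8, 4);
    Eigen::MatrixXd L(16, 4);
    L.noalias() = B * U;   // product written straight into L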
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h
index e5d7ca0..b6ad6e8 100644
--- a/Eigen/src/misc/lapacke_helpers.h
+++ b/Eigen/src/misc/lapacke_helpers.h
@@ -150,6 +150,7 @@
 EIGEN_MAKE_LAPACKE_WRAPPER(potrf)
 EIGEN_MAKE_LAPACKE_WRAPPER(getrf)
 EIGEN_MAKE_LAPACKE_WRAPPER(geqrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(gesdd)
 
 #undef EIGEN_MAKE_LAPACKE_WRAPPER
 }
diff --git a/cmake/EigenSmokeTestList.cmake b/cmake/EigenSmokeTestList.cmake
index 498d529..db7d3ff 100644
--- a/cmake/EigenSmokeTestList.cmake
+++ b/cmake/EigenSmokeTestList.cmake
@@ -103,6 +103,7 @@
   sizeof
   sizeoverflow
   smallvectors
+  sparse_basic_1
   sparse_basic_3
   sparse_block_1
   sparse_extra_4
@@ -132,4 +133,4 @@
   unalignedcount
   vectorwiseop_1
   visitor_1
-  vectorization_logic_1)
\ No newline at end of file
+  vectorization_logic_1)
diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox
index c5dfce4..e96b617 100644
--- a/doc/QuickReference.dox
+++ b/doc/QuickReference.dox
@@ -367,7 +367,8 @@
 <tr class="alt"><td>
 \link MatrixBase::cross() cross product \endlink \matrixworld</td><td>\code
 #include <Eigen/Geometry>
-vec3 = vec1.cross(vec2);\endcode</td></tr>
+v3c = v3a.cross(v3b);    // size-3 vectors
+scalar = v2a.cross(v2b); // size-2 vectors \endcode</td></tr>
 </table>
 
 <a href="#" class="top">top</a>
diff --git a/doc/TutorialMatrixArithmetic.dox b/doc/TutorialMatrixArithmetic.dox
index 5fc569a..53916c2 100644
--- a/doc/TutorialMatrixArithmetic.dox
+++ b/doc/TutorialMatrixArithmetic.dox
@@ -158,7 +158,7 @@
 \verbinclude tut_arithmetic_dot_cross.out
 </td></tr></table>
 
-Remember that cross product is only for vectors of size 3. Dot product is for vectors of any sizes.
+Cross product is defined in Eigen not only for vectors of size 3 but also for vectors of size 2; see \link MatrixBase::cross() the documentation\endlink for details. Dot product is defined for vectors of any size.
 When using complex numbers, Eigen's dot product is conjugate-linear in the first variable and linear in the
 second variable.
 
diff --git a/doc/UsingBlasLapackBackends.dox b/doc/UsingBlasLapackBackends.dox
index 4572820..c700d85 100644
--- a/doc/UsingBlasLapackBackends.dox
+++ b/doc/UsingBlasLapackBackends.dox
@@ -106,6 +106,12 @@
 \endcode</td><td>\code
 ?gesvd
 \endcode</td></tr>
+<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+BDCSVD<MatrixXd> svd;
+svd.compute(m1);
+\endcode</td><td>\code
+?gesdd
+\endcode</td></tr>
 <tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
 EigenSolver<MatrixXd> es(m1);
 ComplexEigenSolver<MatrixXcd> ces(m1);
diff --git a/doc/examples/tut_arithmetic_dot_cross.cpp b/doc/examples/tut_arithmetic_dot_cross.cpp
index 5b0fd1e..d95e03c 100644
--- a/doc/examples/tut_arithmetic_dot_cross.cpp
+++ b/doc/examples/tut_arithmetic_dot_cross.cpp
@@ -9,5 +9,10 @@
   std::cout << "Dot product: " << v.dot(w) << std::endl;
   double dp = v.adjoint()*w; // automatic conversion of the inner product to a scalar
   std::cout << "Dot product via a matrix product: " << dp << std::endl;
+
   std::cout << "Cross product:\n" << v.cross(w) << std::endl;
+  Eigen::Vector2d v2(1,2);
+  Eigen::Vector2d w2(0,1);
+  double cp = v2.cross(w2); // returning a scalar between size-2 vectors
+  std::cout << "Cross product for 2D vectors: " << cp << std::endl;
 }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8ffa002..223a9f1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -185,6 +185,7 @@
 ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
 ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
+ei_add_test(constexpr)
 ei_add_test(constructor)
 ei_add_test(linearstructure)
 ei_add_test(integer_types)
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index 210a4d92..37d23b1 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -63,10 +63,10 @@
 };
 
 template<typename MatrixType, typename Scalar = typename MatrixType::Scalar>
-MatrixType RandomMatrix(int rows, int cols, Scalar min, Scalar max) {
+MatrixType RandomMatrix(Index rows, Index cols, Scalar min, Scalar max) {
   MatrixType M = MatrixType(rows, cols);
-  for (int i=0; i<rows; ++i) {
-    for (int j=0; j<cols; ++j) {
+  for (Index i=0; i<rows; ++i) {
+    for (Index j=0; j<cols; ++j) {
       M(i, j) = Eigen::internal::random<Scalar>(min, max);
     }
   }
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index a7e0ff4..af8a1ef 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -10,7 +10,18 @@
 #include <vector>
 #include "main.h"
 
-template <typename Scalar>
+template <typename Scalar, std::enable_if_t<NumTraits<Scalar>::IsInteger,int> = 0>
+std::vector<Scalar> special_values() {
+  const Scalar zero = Scalar(0);
+  const Scalar one = Scalar(1);
+  const Scalar two = Scalar(2);
+  const Scalar three = Scalar(3);
+  const Scalar min = (std::numeric_limits<Scalar>::min)();
+  const Scalar max = (std::numeric_limits<Scalar>::max)();
+  return { zero, min, one, two, three, max };
+}
+
+template <typename Scalar, std::enable_if_t<!NumTraits<Scalar>::IsInteger, int> = 0>
 std::vector<Scalar> special_values() {
   const Scalar zero = Scalar(0);
   const Scalar eps = Eigen::NumTraits<Scalar>::epsilon();
@@ -25,30 +36,29 @@
   const Scalar min = (std::numeric_limits<Scalar>::min)();
   const Scalar max = (std::numeric_limits<Scalar>::max)();
   const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
-
-  return {zero, denorm_min, min, eps, sqrt_half, one, sqrt2, two, three, max_exp, max, inf, nan};
+  return { zero, denorm_min, min, eps, sqrt_half, one, sqrt2, two, three, max_exp, max, inf, nan };
 }
 
 template<typename Scalar>
 void special_value_pairs(Array<Scalar, Dynamic, Dynamic>& x,
                          Array<Scalar, Dynamic, Dynamic>& y) {
   std::vector<Scalar> abs_vals = special_values<Scalar>();
-  const int abs_cases = abs_vals.size();
-  const int num_cases = 2*abs_cases * 2*abs_cases;
+  const Index abs_cases = (Index)abs_vals.size();
+  const Index num_cases = 2*abs_cases * 2*abs_cases;
   // ensure both vectorized and non-vectorized paths taken
-  const int num_repeats = 2 * internal::packet_traits<Scalar>::size + 1;
+  const Index num_repeats = 2 * (Index)internal::packet_traits<Scalar>::size + 1;
   x.resize(num_repeats, num_cases);
   y.resize(num_repeats, num_cases);
   int count = 0;
-  for (int i = 0; i < abs_cases; ++i) {
+  for (Index i = 0; i < abs_cases; ++i) {
     const Scalar abs_x = abs_vals[i];
-    for (int sign_x = 0; sign_x < 2; ++sign_x) {
+    for (Index sign_x = 0; sign_x < 2; ++sign_x) {
       Scalar x_case = sign_x == 0 ? -abs_x : abs_x;
-      for (int j = 0; j < abs_cases; ++j) {
+      for (Index j = 0; j < abs_cases; ++j) {
         const Scalar abs_y = abs_vals[j];
-        for (int sign_y = 0; sign_y < 2; ++sign_y) {
+        for (Index sign_y = 0; sign_y < 2; ++sign_y) {
           Scalar y_case = sign_y == 0 ? -abs_y : abs_y;
-          for (int repeat = 0; repeat < num_repeats; ++repeat) {
+          for (Index repeat = 0; repeat < num_repeats; ++repeat) {
             x(repeat, count) = x_case;
             y(repeat, count) = y_case;
           }
@@ -68,8 +78,8 @@
 
   Array<Scalar, Dynamic, Dynamic> actual = fun(x, y);
   bool all_pass = true;
-  for (int i = 0; i < x.rows(); ++i) {
-    for (int j = 0; j < x.cols(); ++j) {
+  for (Index i = 0; i < x.rows(); ++i) {
+    for (Index j = 0; j < x.cols(); ++j) {
       Scalar e = static_cast<Scalar>(ref(x(i,j), y(i,j)));
       Scalar a = actual(i, j);
       bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
@@ -98,7 +108,7 @@
   const Scalar tol = test_precision<Scalar>();
 
   std::vector<Scalar> abs_vals = special_values<Scalar>();
-  const int num_vals = abs_vals.size();
+  const Index num_vals = (Index)abs_vals.size();
   Map<Array<Scalar, Dynamic, 1>> bases(abs_vals.data(), num_vals);
 
   bool all_pass = true;
@@ -110,7 +120,7 @@
       if (exponent_is_integer) {
         Int_t exponent_as_int = static_cast<Int_t>(exponent);
         Array<Scalar, Dynamic, 1> eigenPow = bases.pow(exponent_as_int);
-        for (int j = 0; j < num_vals; j++) {
+        for (Index j = 0; j < num_vals; j++) {
           Scalar e = static_cast<Scalar>(std::pow(bases(j), exponent));
           Scalar a = eigenPow(j);
           bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
@@ -123,7 +133,7 @@
       } else {
         // test floating point exponent code path
         Array<Scalar, Dynamic, 1> eigenPow = bases.pow(exponent);
-        for (int j = 0; j < num_vals; j++) {
+        for (Index j = 0; j < num_vals; j++) {
           Scalar e = static_cast<Scalar>(std::pow(bases(j), exponent));
           Scalar a = eigenPow(j);
           bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
@@ -219,7 +229,7 @@
   for (Exponent exponent = min_exponent; exponent < max_exponent; ++exponent) {
     test_exponent<Base, Exponent>(exponent);
   }
-};
+}
 
 void mixed_pow_test() {
   // The following cases will test promoting a smaller exponent type
@@ -260,6 +270,63 @@
   unary_pow_test<long long, int>();
 }
 
+namespace Eigen {
+namespace internal {
+template <typename Scalar>
+struct test_signbit_op {
+  Scalar constexpr operator()(const Scalar& a) const { return numext::signbit(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const {
+    return psignbit(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<test_signbit_op<Scalar>> {
+  enum { Cost = 1, PacketAccess = true }; //todo: define HasSignbit flag
+};
+}  // namespace internal
+}  // namespace Eigen
+
+
+template <typename Scalar>
+void signbit_test() {
+  const size_t size = 100 * internal::packet_traits<Scalar>::size;
+  ArrayX<Scalar> x(size), y(size);
+  x.setRandom();
+  std::vector<Scalar> special_vals = special_values<Scalar>();
+  for (size_t i = 0; i < special_vals.size(); i++) {
+    x(2 * i + 0) = special_vals[i];
+    x(2 * i + 1) = -special_vals[i];
+  }
+  y = x.unaryExpr(internal::test_signbit_op<Scalar>());
+
+  bool all_pass = true;
+  for (size_t i = 0; i < size; i++) {
+    const Scalar ref_val = numext::signbit(x(i));
+    bool not_same = internal::predux_any(internal::bitwise_helper<Scalar>::bitwise_xor(ref_val, y(i)));
+    if (not_same) std::cout << "signbit(" << x(i) << ") != " << y(i) << "\n";
+    all_pass = all_pass && !not_same;
+  }
+
+  VERIFY(all_pass);
+}
+void signbit_tests() {
+  signbit_test<float>();
+  signbit_test<double>();
+  signbit_test<Eigen::half>();
+  signbit_test<Eigen::bfloat16>();
+
+  signbit_test<uint8_t>();
+  signbit_test<uint16_t>();
+  signbit_test<uint32_t>();
+  signbit_test<uint64_t>();
+
+  signbit_test<int8_t>();
+  signbit_test<int16_t>();
+  signbit_test<int32_t>();
+  signbit_test<int64_t>();
+}
+
 template<typename ArrayType> void array(const ArrayType& m)
 {
   typedef typename ArrayType::Scalar Scalar;
@@ -855,6 +922,35 @@
   VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() );
 }
 
+template <typename ArrayType>
+struct signed_shift_test_impl {
+  typedef typename ArrayType::Scalar Scalar;
+  static constexpr size_t Size = sizeof(Scalar);
+  static constexpr size_t MaxShift = (CHAR_BIT * Size) - 1;
+
+  template <size_t N = 0>
+  static inline std::enable_if_t<(N >  MaxShift), void> run(const ArrayType&  ) {}
+  template <size_t N = 0>
+  static inline std::enable_if_t<(N <= MaxShift), void> run(const ArrayType& m) {
+    const Index rows = m.rows();
+    const Index cols = m.cols();
+
+    ArrayType m1 = ArrayType::Random(rows, cols), m2(rows, cols);
+
+    m2 = m1.unaryExpr([](const Scalar& x) { return x >> N; });
+    VERIFY((m2 == m1.unaryExpr(internal::scalar_shift_right_op<Scalar, N>())).all());
+
+    m2 = m1.unaryExpr([](const Scalar& x) { return x << N; });
+    VERIFY((m2 == m1.unaryExpr( internal::scalar_shift_left_op<Scalar, N>())).all());
+
+    run<N + 1>(m);
+  }
+};
+template <typename ArrayType>
+void signed_shift_test(const ArrayType& m) {
+    signed_shift_test_impl<ArrayType>::run(m);
+}
+
 EIGEN_DECLARE_TEST(array_cwise)
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -867,6 +963,9 @@
     CALL_SUBTEST_6( array(Array<Index,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_6( array_integer(Array<Index,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( signed_shift_test(ArrayXXi(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_7( signed_shift_test(Array<Index, Dynamic, Dynamic>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
+
   }
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( comparisons(Array<float, 1, 1>()) );
@@ -897,6 +996,7 @@
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_6( int_pow_test() );
     CALL_SUBTEST_7( mixed_pow_test() );
+    CALL_SUBTEST_8( signbit_tests() );
   }
 
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<int>::type, int >::value));
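The signed_shift_test added above drives the compile-time shift functors through unaryExpr. As a rough usage sketch outside the test harness, assuming the internal functor names scalar_shift_right_op / scalar_shift_left_op stay as used in the test and are reachable via <Eigen/Core>:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::ArrayXXi a(2, 2);
      a << 4, 8, -16, 32;
      // Arithmetic right shift by 2, equivalent to a.unaryExpr([](int x) { return x >> 2; }).
      Eigen::ArrayXXi r = a.unaryExpr(Eigen::internal::scalar_shift_right_op<int, 2>());
      // Left shift by 1, equivalent to a.unaryExpr([](int x) { return x << 1; }).
      Eigen::ArrayXXi l = a.unaryExpr(Eigen::internal::scalar_shift_left_op<int, 1>());
      std::cout << r << "\n\n" << l << "\n";
      return 0;
    }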
diff --git a/test/constexpr.cpp b/test/constexpr.cpp
new file mode 100644
index 0000000..b8f0b09
--- /dev/null
+++ b/test/constexpr.cpp
@@ -0,0 +1,52 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Alex Richardson <alexrichardson@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+EIGEN_DECLARE_TEST(constexpr) {
+  // Clang accepts (some of) this code when using C++14/C++17, but GCC does not like
+  // the fact that `T array[Size]` inside Eigen::internal::plain_array is not initialized
+  // until after the constructor returns:
+  // error: member ‘Eigen::internal::plain_array<int, 9, 0, 0>::array’ must be initialized by mem-initializer in
+  // ‘constexpr’ constructor
+#if EIGEN_COMP_CXXVER >= 20
+  constexpr Matrix3i mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  VERIFY_IS_EQUAL(mat.size(), 9);
+  VERIFY_IS_EQUAL(mat(0, 0), 1);
+  static_assert(mat.coeff(0,1) == 2);
+  constexpr Array33i arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  VERIFY_IS_EQUAL(arr(0, 0), 1);
+  VERIFY_IS_EQUAL(arr.size(), 9);
+  static_assert(arr.coeff(0,1) == 2);
+  // Also check dynamic size arrays/matrices with fixed-size storage (currently
+  // only works if all elements are initialized, since otherwise the compiler
+  // complains about uninitialized trailing elements).
+  constexpr Matrix<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  VERIFY_IS_EQUAL(dyn_mat.size(), 9);
+  VERIFY_IS_EQUAL(dyn_mat(0, 0), 1);
+  static_assert(dyn_mat.coeff(0,1) == 2);
+  constexpr Array<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  VERIFY_IS_EQUAL(dyn_arr(0, 0), 1);
+  VERIFY_IS_EQUAL(dyn_arr.size(), 9);
+  static_assert(dyn_arr.coeff(0,1) == 2);
+#endif  // EIGEN_COMP_CXXVER >= 20
+}
+
+// Check that we can use the std::initializer_list constructor for constexpr variables.
+#if EIGEN_COMP_CXXVER >= 20
+// EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT() will fail constexpr evaluation unless
+// we have std::is_constant_evaluated().
+constexpr Matrix<int, 2, 2> global_mat({{1, 2}, {3, 4}});
+
+EIGEN_DECLARE_TEST(constexpr_global) {
+  VERIFY_IS_EQUAL(global_mat.size(), 4);
+  VERIFY_IS_EQUAL(global_mat(0, 0), 1);
+  static_assert(global_mat.coeff(0,0) == 1);
+}
+#endif  // EIGEN_COMP_CXXVER >= 20
diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp
index 5f7ddb9..64b3927 100644
--- a/test/geo_orthomethods.cpp
+++ b/test/geo_orthomethods.cpp
@@ -78,6 +78,36 @@
   VERIFY_IS_APPROX(v2.cross(v1), rv1.cross(v1));
 }
 
+template<typename Scalar> void orthomethods_2()
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,2,1> Vector2;
+  typedef Matrix<Scalar,3,1> Vector3;
+
+  Vector3 v30 = Vector3::Random(),
+          v31 = Vector3::Random();
+  Vector2 v20 = v30.template head<2>();
+  Vector2 v21 = v31.template head<2>();
+
+  VERIFY_IS_MUCH_SMALLER_THAN(v20.cross(v20), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(v21.cross(v21), Scalar(1));
+  VERIFY_IS_APPROX(v20.cross(v21), v30.cross(v31).z());
+  
+  Vector2 v20Rot90(numext::conj(-v20.y()), numext::conj(v20.x()));
+  VERIFY_IS_APPROX(v20.cross( v20Rot90),  v20.squaredNorm());
+  VERIFY_IS_APPROX(v20.cross(-v20Rot90), -v20.squaredNorm());
+  Vector2 v21Rot90(numext::conj(-v21.y()), numext::conj(v21.x()));
+  VERIFY_IS_APPROX(v21.cross( v21Rot90),  v21.squaredNorm());
+  VERIFY_IS_APPROX(v21.cross(-v21Rot90), -v21.squaredNorm());
+
+  // check mixed product
+  typedef Matrix<RealScalar, 2, 1> RealVector2;
+  RealVector2 rv21 = RealVector2::Random();
+  v21 = rv21.template cast<Scalar>();
+  VERIFY_IS_APPROX(v20.cross(v21), v20.cross(rv21));
+  VERIFY_IS_APPROX(v21.cross(v20), rv21.cross(v20));
+}
+
 template<typename Scalar, int Size> void orthomethods(int size=Size)
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -119,6 +149,9 @@
 EIGEN_DECLARE_TEST(geo_orthomethods)
 {
   for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( orthomethods_2<float>() );
+    CALL_SUBTEST_2( orthomethods_2<double>() );
+    CALL_SUBTEST_4( orthomethods_2<std::complex<double> >() );
     CALL_SUBTEST_1( orthomethods_3<float>() );
     CALL_SUBTEST_2( orthomethods_3<double>() );
     CALL_SUBTEST_4( orthomethods_3<std::complex<double> >() );
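The orthomethods_2 test added above exercises the new cross product for size-2 vectors, which returns the scalar z-component of the corresponding 3D cross product instead of a vector. A minimal sketch of that behaviour, assuming only what the test verifies (v20.cross(v21) equals v30.cross(v31).z()):

    #include <Eigen/Geometry>
    #include <iostream>

    int main() {
      Eigen::Vector2d a(1.0, 2.0), b(3.0, 4.0);
      // For 2D vectors, cross() yields the scalar a.x()*b.y() - a.y()*b.x().
      double z = a.cross(b);
      Eigen::Vector3d a3(1.0, 2.0, 0.0), b3(3.0, 4.0, 0.0);
      std::cout << z << " == " << a3.cross(b3).z() << "\n";  // both -2
      return 0;
    }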
diff --git a/test/main.h b/test/main.h
index 884b6c3..a52da9e 100644
--- a/test/main.h
+++ b/test/main.h
@@ -670,6 +670,12 @@
   return true;
 }
 
+template <typename Derived1, typename Derived2>
+bool test_isCwiseApprox(const SparseMatrixBase<Derived1>& m1,
+                        const SparseMatrixBase<Derived2>& m2, bool exact) {
+  return test_isCwiseApprox(m1.toDense(), m2.toDense(), exact);
+}
+
 template<typename T, typename U>
 bool test_is_equal(const T& actual, const U& expected, bool expect_equal)
 {
diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp
index 3a86bea..c25f0bf 100644
--- a/test/nestbyvalue.cpp
+++ b/test/nestbyvalue.cpp
@@ -33,5 +33,7 @@
     MatrixXd b = x;
     VERIFY_IS_EQUAL(nb_temporaries,6+1);
     VERIFY_IS_APPROX(b, a.rowwise().reverse().eval() + (a+a).eval());
+    // Block expressions work with dense NestByValue.
+    VERIFY_IS_APPROX(b, a.nestByValue().rowwise().reverse().eval() + (a.nestByValue()+a.nestByValue()).eval());
   }
 }
diff --git a/test/nullary.cpp b/test/nullary.cpp
index 2c4d938..e524837 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -78,8 +78,9 @@
   const Scalar step = ((size == 1) ? 1 : (high-low)/RealScalar(size-1));
 
   // check whether the result yields what we expect it to do
-  VectorType m(base);
+  VectorType m(base), o(base);
   m.setLinSpaced(size,low,high);
+  o.setEqualSpaced(size, low, step);
 
   if(!NumTraits<Scalar>::IsInteger)
   {
@@ -87,6 +88,7 @@
     for (int i=0; i<size; ++i)
       n(i) = low+RealScalar(i)*step;
     VERIFY_IS_APPROX(m,n);
+    VERIFY_IS_APPROX(n,o);
 
     CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
   }
@@ -256,11 +258,12 @@
 {
   // Check possible overflow issue
   int n = 60000;
-  ArrayXi a1(n), a2(n);
-  a1.setLinSpaced(n, 0, n-1);
-  for(int i=0; i<n; ++i)
-    a2(i) = i;
-  VERIFY_IS_APPROX(a1,a2);
+  ArrayXi a1(n), a2(n), a_ref(n);
+  a1.setLinSpaced(n, 0, n - 1);
+  a2.setEqualSpaced(n, 0, 1);
+  for (int i = 0; i < n; ++i) a_ref(i) = i;
+  VERIFY_IS_APPROX(a1, a_ref);
+  VERIFY_IS_APPROX(a2, a_ref);
 }
 
 template<int>
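setEqualSpaced, checked above alongside setLinSpaced, takes a start value and a step rather than the two endpoints. A small usage sketch, assuming the (size, low, step) signature exercised by the test:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::ArrayXf lin(5), eq(5);
      lin.setLinSpaced(5, 0.0f, 8.0f);   // endpoints 0 and 8: 0, 2, 4, 6, 8
      eq.setEqualSpaced(5, 0.0f, 2.0f);  // start 0, step 2: same sequence
      std::cout << (lin - eq).abs().maxCoeff() << "\n";  // ~0
      return 0;
    }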
diff --git a/test/numext.cpp b/test/numext.cpp
index ee879c9..e99eddc 100644
--- a/test/numext.cpp
+++ b/test/numext.cpp
@@ -239,6 +239,63 @@
   check_rsqrt_impl<T>::run();
 }
 
+
+template <typename T>
+struct check_signbit_impl {
+  static void run() {
+    T true_mask;
+    std::memset(static_cast<void*>(&true_mask), 0xff, sizeof(T));
+    T false_mask;
+    std::memset(static_cast<void*>(&false_mask), 0x00, sizeof(T));
+
+    std::vector<T> negative_values;
+    std::vector<T> non_negative_values;
+
+    if (NumTraits<T>::IsInteger) {
+      negative_values = {static_cast<T>(-1),
+        static_cast<T>(NumTraits<T>::lowest())};
+      non_negative_values = {static_cast<T>(0), static_cast<T>(1),
+        static_cast<T>(NumTraits<T>::highest())};
+    } else {
+      // negative values: the sign bit is set
+      const T neg_zero = static_cast<T>(-0.0);
+      const T neg_one = static_cast<T>(-1.0);
+      const T neg_inf = -std::numeric_limits<T>::infinity();
+      const T neg_nan = -std::numeric_limits<T>::quiet_NaN();
+      // non-negative values: the sign bit is clear
+      const T pos_zero = static_cast<T>(0.0);
+      const T pos_one = static_cast<T>(1.0);
+      const T pos_inf = std::numeric_limits<T>::infinity();
+      const T pos_nan = std::numeric_limits<T>::quiet_NaN();
+      negative_values = {neg_zero, neg_one, neg_inf, neg_nan};
+      non_negative_values = {pos_zero, pos_one, pos_inf, pos_nan};
+    }
+
+
+    auto check_all = [](auto values, auto expected) {
+      bool all_pass = true;
+      for (T val : values) {
+        const T numext_val = numext::signbit(val);
+        bool not_same = internal::predux_any(
+            internal::bitwise_helper<T>::bitwise_xor(expected, numext_val));
+        all_pass = all_pass && !not_same;
+        if (not_same)
+          std::cout << "signbit(" << val << ") = " << numext_val
+                    << " != " << expected << std::endl;
+      }
+      return all_pass;
+    };
+
+    bool all_pass = check_all(non_negative_values, false_mask);
+    all_pass = all_pass && check_all(negative_values, (NumTraits<T>::IsSigned ? true_mask : false_mask));
+    VERIFY(all_pass);
+  }
+};
+template <typename T>
+void check_signbit() {
+  check_signbit_impl<T>::run();
+}
+
 EIGEN_DECLARE_TEST(numext) {
   for(int k=0; k<g_repeat; ++k)
   {
@@ -266,10 +323,25 @@
     CALL_SUBTEST( check_sqrt<double>() );
     CALL_SUBTEST( check_sqrt<std::complex<float> >() );
     CALL_SUBTEST( check_sqrt<std::complex<double> >() );
-    
+
     CALL_SUBTEST( check_rsqrt<float>() );
     CALL_SUBTEST( check_rsqrt<double>() );
     CALL_SUBTEST( check_rsqrt<std::complex<float> >() );
     CALL_SUBTEST( check_rsqrt<std::complex<double> >() );
+
+    CALL_SUBTEST( check_signbit<half>());
+    CALL_SUBTEST( check_signbit<bfloat16>());
+    CALL_SUBTEST( check_signbit<float>());
+    CALL_SUBTEST( check_signbit<double>());
+
+    CALL_SUBTEST( check_signbit<uint8_t>());
+    CALL_SUBTEST( check_signbit<uint16_t>());
+    CALL_SUBTEST( check_signbit<uint32_t>());
+    CALL_SUBTEST( check_signbit<uint64_t>());
+
+    CALL_SUBTEST( check_signbit<int8_t>());
+    CALL_SUBTEST( check_signbit<int16_t>());
+    CALL_SUBTEST( check_signbit<int32_t>());
+    CALL_SUBTEST( check_signbit<int64_t>());
   }
 }
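As the check_signbit test above encodes, numext::signbit returns a value of the argument's own type whose bits are all ones when the sign bit is set and all zeros otherwise (always zero for unsigned integer types). A minimal sketch under that assumption, using numext::bit_cast (as in the packetmath test below) to inspect the mask:

    #include <Eigen/Core>
    #include <cstdint>
    #include <iostream>

    int main() {
      float neg = -1.0f, pos = 1.0f;
      // Reinterpret the float results as unsigned integers to see the bit masks.
      uint32_t neg_mask = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::signbit(neg));
      uint32_t pos_mask = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::signbit(pos));
      std::cout << std::hex << neg_mask << " " << pos_mask << "\n";  // ffffffff 0
      return 0;
    }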
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index d7c7c9c..f082985 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -790,9 +790,19 @@
   const int PacketSize = internal::unpacket_traits<Packet>::size;
 
   const int size = PacketSize * 4;
-  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
-  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
-  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4] = {};
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4] = {};
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4] = {};
+  
+  // Negate with -0.
+  if (PacketTraits::HasNegate) {
+    test::packet_helper<PacketTraits::HasNegate,Packet> h;
+    data1[0] = Scalar{-0};
+    h.store(data2, internal::pnegate(h.load(data1)));
+    typedef typename internal::make_unsigned<typename internal::make_integer<Scalar>::type>::type Bits;
+    Bits bits = numext::bit_cast<Bits>(data2[0]);
+    VERIFY_IS_EQUAL(bits, static_cast<Bits>(Bits(1)<<(sizeof(Scalar)*CHAR_BIT - 1)));
+  }
 
   for (int i = 0; i < size; ++i) {
     data1[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
diff --git a/test/product.h b/test/product.h
index a4d8591..b0ce06d 100644
--- a/test/product.h
+++ b/test/product.h
@@ -17,6 +17,19 @@
                           * (std::max)(m1.cwiseAbs2().maxCoeff(), m2.cwiseAbs2().maxCoeff()));
 }
 
+// Allow specifying tolerance for verifying error.
+template<typename Type1, typename Type2, typename Tol>
+inline bool verifyIsApprox(const Type1& a, const Type2& b, Tol tol)
+{
+  bool ret = a.isApprox(b, tol);
+  if(!ret)
+  {
+    std::cerr << "Difference too large wrt tolerance " << tol  << ", relative error is: " << test_relative_error(a,b) << std::endl;
+  }
+  return ret;
+}
+
+
 template <typename LhsType, typename RhsType>
 std::enable_if_t<RhsType::SizeAtCompileTime==Dynamic,void>
 check_mismatched_product(LhsType& lhs, const RhsType& rhs) {
@@ -34,6 +47,7 @@
      Identity.h Product.h
   */
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RowVectorType;
   typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ColVectorType;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> RowSquareMatrixType;
@@ -41,6 +55,11 @@
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
                          MatrixType::Flags&RowMajorBit?ColMajor:RowMajor> OtherMajorMatrixType;
 
+  // We want a tighter epsilon for not-approx tests.  Otherwise, for certain
+  // low-precision types (e.g. bfloat16), the bound ends up being relatively large
+  // (e.g. 0.12), causing flaky tests.
+  RealScalar not_approx_epsilon = RealScalar(0.1) * NumTraits<RealScalar>::dummy_precision();
+
   Index rows = m.rows();
   Index cols = m.cols();
 
@@ -68,7 +87,11 @@
 
   // begin testing Product.h: only associativity for now
   // (we use Transpose.h but this doesn't count as a test for it)
-  VERIFY_IS_APPROX((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2));
+  {
+    // Increase tolerance, since coefficients here can get relatively large.
+    RealScalar tol = RealScalar(2) * get_test_precision(m1);
+    VERIFY(verifyIsApprox((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2), tol));
+  }
   m3 = m1;
   m3 *= m1.transpose() * m2;
   VERIFY_IS_APPROX(m3,                      m1 * (m1.transpose()*m2));
@@ -96,7 +119,7 @@
   // (we use the more accurate default epsilon)
   if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
   {
-    VERIFY(areNotApprox(m1.transpose()*m2,m2.transpose()*m1));
+    VERIFY(areNotApprox(m1.transpose()*m2,m2.transpose()*m1, not_approx_epsilon));
   }
 
   // test optimized operator+= path
@@ -105,7 +128,7 @@
   VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
   if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
   {
-    VERIFY(areNotApprox(res,square + m2 * m1.transpose()));
+    VERIFY(areNotApprox(res,square + m2 * m1.transpose(), not_approx_epsilon));
   }
   vcres = vc2;
   vcres.noalias() += m1.transpose() * v1;
@@ -117,7 +140,7 @@
   VERIFY_IS_APPROX(res, square - (m1 * m2.transpose()));
   if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
   {
-    VERIFY(areNotApprox(res,square - m2 * m1.transpose()));
+    VERIFY(areNotApprox(res,square - m2 * m1.transpose(), not_approx_epsilon));
   }
   vcres = vc2;
   vcres.noalias() -= m1.transpose() * v1;
@@ -138,7 +161,7 @@
   res.noalias() = square + m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
   res.noalias() += square + m1 * m2.transpose();
-  VERIFY_IS_APPROX(res, 2*(square + m1 * m2.transpose()));
+  VERIFY_IS_APPROX(res, Scalar(2)*(square + m1 * m2.transpose()));
   res.noalias() -= square + m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
 
@@ -146,7 +169,7 @@
   res.noalias() = square - m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
   res.noalias() += square - m1 * m2.transpose();
-  VERIFY_IS_APPROX(res, 2*(square - m1 * m2.transpose()));
+  VERIFY_IS_APPROX(res, Scalar(2)*(square - m1 * m2.transpose()));
   res.noalias() -= square - m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
 
@@ -169,7 +192,7 @@
   VERIFY_IS_APPROX(res2, square2 + m1.transpose() * m2);
   if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
   {
-    VERIFY(areNotApprox(res2,square2 + m2.transpose() * m1));
+    VERIFY(areNotApprox(res2,square2 + m2.transpose() * m1, not_approx_epsilon));
   }
 
   VERIFY_IS_APPROX(res.col(r).noalias() = square.adjoint() * square.col(r), (square.adjoint() * square.col(r)).eval());
@@ -247,10 +270,12 @@
 
   // regression for blas_trais
   {
-    VERIFY_IS_APPROX(square * (square*square).transpose(), square * square.transpose() * square.transpose());
-    VERIFY_IS_APPROX(square * (-(square*square)), -square * square * square);
-    VERIFY_IS_APPROX(square * (s1*(square*square)), s1 * square * square * square);
-    VERIFY_IS_APPROX(square * (square*square).conjugate(), square * square.conjugate() * square.conjugate());
+    // Increase test tolerance, since coefficients can get relatively large.
+    RealScalar tol = RealScalar(2) * get_test_precision(square);
+    VERIFY(verifyIsApprox(square * (square*square).transpose(), square * square.transpose() * square.transpose(), tol));
+    VERIFY(verifyIsApprox(square * (-(square*square)), -square * square * square, tol));
+    VERIFY(verifyIsApprox(square * (s1*(square*square)), s1 * square * square * square, tol));
+    VERIFY(verifyIsApprox(square * (square*square).conjugate(), square * square.conjugate() * square.conjugate(), tol));
   }
 
   // destination with a non-default inner-stride
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 3d0204b..8a9d7f8 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -117,6 +117,7 @@
     CALL_SUBTEST_8( product(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( product(Matrix<std::complex<float>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_10( product(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_11( product(Matrix<bfloat16,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
 
   CALL_SUBTEST_6( product_large_regressions<0>() );
diff --git a/test/product_small.cpp b/test/product_small.cpp
index c1d9943..a3a04dc 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -281,6 +281,25 @@
   }
 }
 
+template<typename T>
+void product_sweep(int max_m, int max_k, int max_n) {
+  using Matrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+  for (int m = 1; m < max_m; ++m) {
+    for (int n = 1; n < max_n; ++n) {
+      Matrix C = Matrix::Zero(m, n);
+      Matrix Cref = Matrix::Zero(m, n);
+      for (int k = 1; k < max_k; ++k) {
+        Matrix A = Matrix::Random(m, k);
+        Matrix B = Matrix::Random(k, n);
+        C = A * B;
+        Cref.setZero();
+        ref_prod(Cref, A, B);
+        VERIFY_IS_APPROX(C, Cref);
+      }
+    }
+  }   
+}
+
 EIGEN_DECLARE_TEST(product_small)
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -290,6 +309,7 @@
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
+    CALL_SUBTEST_10( product(Matrix<bfloat16, 3, 2>()) );
     CALL_SUBTEST_6( product1x1<0>() );
 
     CALL_SUBTEST_11( test_lazy_l1<float>() );
@@ -316,6 +336,12 @@
     CALL_SUBTEST_6( bug_1311<5>() );
 
     CALL_SUBTEST_9( test_dynamic_bool() );
+    
+    // Commonly specialized vectorized types.
+    CALL_SUBTEST_50( product_sweep<float>(10, 10, 10) );
+    CALL_SUBTEST_51( product_sweep<double>(10, 10, 10) );
+    CALL_SUBTEST_52( product_sweep<Eigen::half>(10, 10, 10) );
+    CALL_SUBTEST_53( product_sweep<Eigen::bfloat16>(10, 10, 10) );
   }
 
   CALL_SUBTEST_6( product_small_regressions<0>() );
diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp
index 8becd37..16ccb3a 100644
--- a/test/product_syrk.cpp
+++ b/test/product_syrk.cpp
@@ -141,6 +141,7 @@
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) );
     CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) );
+    CALL_SUBTEST_5( syrk(Matrix<bfloat16, Dynamic, Dynamic>(s, s)) );
     TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 }
diff --git a/test/reshape.cpp b/test/reshape.cpp
index d248f01..ca7a73e 100644
--- a/test/reshape.cpp
+++ b/test/reshape.cpp
@@ -196,6 +196,24 @@
   }
 }
 
+template<typename BlockType>
+void reshape_block(const BlockType& M) {
+  auto dense = M.eval();
+  Index rows = M.size() / 2;
+  Index cols = M.size() / rows;
+  VERIFY_IS_EQUAL(dense.reshaped(rows, cols), M.reshaped(rows, cols));
+  
+  for (Index i=0; i<rows; ++i) {
+    VERIFY_IS_EQUAL(dense.reshaped(rows, cols).row(i),
+                    M.reshaped(rows, cols).row(i));
+  }
+  
+  for (Index j = 0; j<cols; ++j) {
+    VERIFY_IS_EQUAL(dense.reshaped(rows, cols).col(j),
+                    M.reshaped(rows, cols).col(j));
+  }
+}
+
 EIGEN_DECLARE_TEST(reshape)
 {
   typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
@@ -216,4 +234,5 @@
 
   CALL_SUBTEST(reshape4x4(rmx));
   CALL_SUBTEST(reshape4x4(rm4));
+  CALL_SUBTEST(reshape_block(rm4.col(1)));
 }
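reshape_block above checks that reshaped() on a block expression (whose inner stride may be non-unit) matches reshaping its dense evaluation, which is what the reshape-strides fix in this update targets. A small sketch, assuming only the reshaped(rows, cols) call used in the test:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> m(4, 4);
      for (int i = 0; i < 16; ++i) m(i / 4, i % 4) = i;
      // A column of a row-major matrix has a non-unit inner stride.
      auto col = m.col(1);
      std::cout << col.reshaped(2, 2) << "\n\n";
      std::cout << col.eval().reshaped(2, 2) << "\n";  // should match the line above
      return 0;
    }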
diff --git a/test/serializer.cpp b/test/serializer.cpp
index 76b9083..d3e7ba5 100644
--- a/test/serializer.cpp
+++ b/test/serializer.cpp
@@ -9,8 +9,71 @@
 
 #include "main.h"
 
-#include <vector>
 #include <Eigen/Core>
+#include <Eigen/SparseCore>
+#include <vector>
+
+template <typename T>
+struct RandomImpl {
+  static auto Create(Eigen::Index rows, Eigen::Index cols) {
+    return T::Random(rows, cols);
+  }
+};
+
+template <typename Scalar, int Options, typename DenseIndex>
+struct RandomImpl<Eigen::SparseMatrix<Scalar, Options, DenseIndex>> {
+  using T = Eigen::SparseMatrix<Scalar, Options, DenseIndex>;
+
+  static auto Create(Eigen::Index rows, Eigen::Index cols) {
+    Eigen::SparseMatrix<Scalar, Options, DenseIndex> M(rows, cols);
+    M.setZero();
+    double density = 0.1;
+
+    // Reserve some space along each inner dim.
+    int nnz = static_cast<int>(std::ceil(density * 1.5 * M.innerSize()));
+    M.reserve(Eigen::VectorXi::Constant(M.outerSize(), nnz));
+
+    for (int j = 0; j < M.outerSize(); j++) {
+      for (int i = 0; i < M.innerSize(); i++) {
+        bool zero = (Eigen::internal::random<double>(0, 1) > density);
+        if (!zero) {
+          M.insertByOuterInner(j, i) = internal::random<Scalar>();
+        }
+      }
+    }
+
+    // 50-50 whether to compress or not.
+    if (Eigen::internal::random<double>(0, 1) >= 0.5) {
+      M.makeCompressed();
+    }
+
+    return M;
+  }
+};
+
+template <typename Scalar, int Options, typename DenseIndex>
+struct RandomImpl<Eigen::SparseVector<Scalar, Options, DenseIndex>> {
+  using T = Eigen::SparseVector<Scalar, Options, DenseIndex>;
+
+  static auto Create(Eigen::Index rows, Eigen::Index cols) {
+    Eigen::SparseVector<Scalar, Options, DenseIndex> M(rows, cols);
+    M.setZero();
+    double density = 0.1;
+
+    // Reserve some space along each inner dim.
+    int nnz = static_cast<int>(density * 1.5 * M.innerSize());
+    M.reserve(nnz);
+
+    for (int i = 0; i < M.innerSize(); i++) {
+      bool zero = (Eigen::internal::random<double>(0, 1) > density);
+      if (!zero) {
+        M.insert(i) = internal::random<Scalar>();
+      }
+    }
+
+    return M;
+  }
+};
 
 struct MyPodType {
   double x;
@@ -67,9 +130,9 @@
 void test_eigen_type(const T& type) {
   const Index rows = type.rows();
   const Index cols = type.cols();
-  
-  const T initial = T::Random(rows, cols);
-  
+
+  const T initial = RandomImpl<T>::Create(rows, cols);
+
   // Serialize.
   Eigen::Serializer<T> serializer;
   size_t buffer_size = serializer.size(initial);
@@ -160,7 +223,9 @@
     CALL_SUBTEST( test_eigen_type(Eigen::Vector3f()) );
     CALL_SUBTEST( test_eigen_type(Eigen::Matrix4d()) );
     CALL_SUBTEST( test_eigen_type(Eigen::MatrixXd(15, 17)) );
-    
+    CALL_SUBTEST(test_eigen_type(Eigen::SparseMatrix<float>(13, 12)));
+    CALL_SUBTEST(test_eigen_type(Eigen::SparseVector<float>(17)));
+
     CALL_SUBTEST( test_dense_types( Eigen::Array33f(),
                                     Eigen::ArrayXd(10),
                                     Eigen::MatrixXd(15, 17)) );
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 8694490..52ff7b7 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -28,8 +28,8 @@
   
   const Index rows = ref.rows();
   const Index cols = ref.cols();
-  //const Index inner = ref.innerSize();
-  //const Index outer = ref.outerSize();
+  const Index inner = ref.innerSize();
+  const Index outer = ref.outerSize();
 
   typedef typename SparseMatrixType::Scalar Scalar;
   typedef typename SparseMatrixType::RealScalar RealScalar;
@@ -91,8 +91,12 @@
         for (Index k=0; k<nnz; ++k)
         {
           Index i = internal::random<Index>(0,rows-1);
-          if (m1.coeff(i,j)==Scalar(0))
-            m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
+          if (m1.coeff(i, j) == Scalar(0)) {
+            Scalar v = internal::random<Scalar>();
+            if (v == Scalar(0)) v = Scalar(1);
+            m1(i, j) = v;
+            m2.insert(i, j) = v;
+          }
         }
       }
       
@@ -116,13 +120,18 @@
       {
         Index i = internal::random<Index>(0,rows-1);
         Index j = internal::random<Index>(0,cols-1);
-        if ((m1.coeff(i,j)==Scalar(0)) && (internal::random<int>()%2))
-          m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
+        if ((m1.coeff(i, j) == Scalar(0)) && (internal::random<int>() % 2)) {
+          Scalar v = internal::random<Scalar>();
+          if (v == Scalar(0)) v = Scalar(1);
+          m1(i, j) = v;
+          m2.insert(i, j) = v;
+        }
         else
         {
           Scalar v = internal::random<Scalar>();
-          m2.coeffRef(i,j) += v;
-          m1(i,j) += v;
+          if (v == Scalar(0)) v = Scalar(1);
+          m1(i, j) = v;
+          m2.coeffRef(i, j) = v;
         }
       }
       VERIFY_IS_APPROX(m2,m1);
@@ -140,8 +149,12 @@
       {
         Index i = internal::random<Index>(0,rows-1);
         Index j = internal::random<Index>(0,cols-1);
-        if (m1.coeff(i,j)==Scalar(0))
-          m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
+        if (m1.coeff(i, j) == Scalar(0)) {
+          Scalar v = internal::random<Scalar>();
+          if (v == Scalar(0)) v = Scalar(1);
+          m1(i, j) = v;
+          m2.insert(i, j) = v;
+        }
         if(mode==3)
           m2.reserve(r);
       }
@@ -150,6 +163,63 @@
       VERIFY_IS_APPROX(m2,m1);
     }
 
+    // test sort
+    if (inner > 1) {
+      bool StorageOrdersMatch = DenseMatrix::IsRowMajor == SparseMatrixType::IsRowMajor;
+      DenseMatrix m1(rows, cols);
+      m1.setZero();
+      SparseMatrixType m2(rows, cols);
+      // generate random inner indices with no repeats
+      Vector<Index, Dynamic> innerIndices(inner);
+      innerIndices.setLinSpaced(inner, 0, inner - 1);
+      for (Index j = 0; j < outer; j++) {
+        std::random_shuffle(innerIndices.begin(), innerIndices.end());
+        Index nzj = internal::random<Index>(2, inner / 2);
+        for (Index k = 0; k < nzj; k++) {
+          Index i = innerIndices[k];
+          Scalar val = internal::random<Scalar>();
+          m1.coeffRefByOuterInner(StorageOrdersMatch ? j : i, StorageOrdersMatch ? i : j) = val;
+          m2.insertByOuterInner(j, i) = val;
+        }
+      }
+
+      VERIFY_IS_APPROX(m2, m1);
+      // sort wrt greater
+      m2.template sortInnerIndices<std::greater<>>();
+      // verify that all inner vectors are not sorted wrt less
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::less<>>(), 0);
+      // verify that all inner vectors are sorted wrt greater
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::greater<>>(), m2.outerSize());
+      // verify that sort does not change evaluation
+      VERIFY_IS_APPROX(m2, m1);
+      // sort wrt less
+      m2.template sortInnerIndices<std::less<>>();
+      // verify that all inner vectors are sorted wrt less
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::less<>>(), m2.outerSize());
+      // verify that all inner vectors are not sorted wrt greater
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::greater<>>(), 0);
+      // verify that sort does not change evaluation
+      VERIFY_IS_APPROX(m2, m1);
+
+      m2.makeCompressed();
+      // sort wrt greater
+      m2.template sortInnerIndices<std::greater<>>();
+      // verify that all inner vectors are not sorted wrt less
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::less<>>(), 0);
+      // verify that all inner vectors are sorted wrt greater
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::greater<>>(), m2.outerSize());
+      // verify that sort does not change evaluation
+      VERIFY_IS_APPROX(m2, m1);
+      // sort wrt less
+      m2.template sortInnerIndices<std::less<>>();
+      // verify that all inner vectors are sorted wrt less
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::less<>>(), m2.outerSize());
+      // verify that all inner vectors are not sorted wrt greater
+      VERIFY_IS_EQUAL(m2.template innerIndicesAreSorted<std::greater<>>(), 0);
+      // verify that sort does not change evaluation
+      VERIFY_IS_APPROX(m2, m1);
+    }
+
   // test basic computations
   {
     DenseMatrix refM1 = DenseMatrix::Zero(rows, cols);
@@ -729,9 +799,12 @@
     CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(8, 8)) ));
     CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
     CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(r, c)) ));
-    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,ColMajor,long int>(r, c)) ));
-    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,RowMajor,long int>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<float,  RowMajor>(r, c))));
+    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<float,  ColMajor>(r, c))));
+    CALL_SUBTEST_3(( sparse_basic(SparseMatrix<double, ColMajor>(r, c))));
+    CALL_SUBTEST_3(( sparse_basic(SparseMatrix<double, RowMajor>(r, c))));
+    CALL_SUBTEST_4(( sparse_basic(SparseMatrix<double, ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_4(( sparse_basic(SparseMatrix<double, RowMajor,long int>(r, c)) ));
     
     r = Eigen::internal::random<int>(1,100);
     c = Eigen::internal::random<int>(1,100);
@@ -739,14 +812,14 @@
       r = c; // check square matrices in 25% of tries
     }
     
-    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
-    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
   }
 
   // Regression test for bug 900: (manually insert higher values here, if you have enough RAM):
-  CALL_SUBTEST_3((big_sparse_triplet<SparseMatrix<float, RowMajor, int> >(10000, 10000, 0.125)));
-  CALL_SUBTEST_4((big_sparse_triplet<SparseMatrix<double, ColMajor, long int> >(10000, 10000, 0.125)));
+  CALL_SUBTEST_5(( big_sparse_triplet<SparseMatrix<float, RowMajor, int>>(10000, 10000, 0.125)));
+  CALL_SUBTEST_5(( big_sparse_triplet<SparseMatrix<double, ColMajor, long int>>(10000, 10000, 0.125)));
 
-  CALL_SUBTEST_7( bug1105<0>() );
+  CALL_SUBTEST_5(bug1105<0>());
 }
 #endif
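The new sort block in sparse_basic uses sortInnerIndices / innerIndicesAreSorted, where innerIndicesAreSorted<Comp>() returns the number of inner vectors whose indices are sorted with respect to Comp (so a fully sorted matrix returns outerSize()). A minimal sketch under that assumption:

    #include <Eigen/SparseCore>
    #include <functional>
    #include <iostream>

    int main() {
      Eigen::SparseMatrix<double> m(4, 4);
      m.reserve(Eigen::VectorXi::Constant(4, 2));
      // Two entries per column, inserted with unordered inner (row) indices.
      for (int j = 0; j < 4; ++j) {
        m.insert(3, j) = 1.0;
        m.insert(0, j) = 2.0;
      }
      // Sort each inner vector's indices descending, then ascending.
      m.sortInnerIndices<std::greater<>>();
      std::cout << m.innerIndicesAreSorted<std::greater<>>() << " of " << m.outerSize() << "\n";  // 4 of 4
      m.sortInnerIndices<std::less<>>();
      std::cout << m.innerIndicesAreSorted<std::less<>>() << " of " << m.outerSize() << "\n";     // 4 of 4
      return 0;
    }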
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index a9b18b8..01846e2 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -99,6 +99,13 @@
     VERIFY(solver.info() == Success && "solving failed when using Map");
     VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");
     VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
+    
+    // Test with a Map and non-unit stride.
+    Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> out(2*xm.rows(), 2*xm.cols());
+    out.setZero();
+    Eigen::Map<DenseRhs, 0, Stride<Eigen::Dynamic, 2>> outm(out.data(), xm.rows(), xm.cols(), Stride<Eigen::Dynamic, 2>(2 * xm.rows(), 2));
+    outm = solver.solve(bm);
+    VERIFY(outm.isApprox(refX,test_precision<Scalar>()));
   }
   
   // if not too large, do some extra check:
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index e1e74f3..88cea09 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -15,6 +15,7 @@
   double densityVec = (std::max)(8./(rows), 0.1);
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  typedef Matrix<DenseIndex,Dynamic,1> DenseIndexVector;
   typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
   typedef SparseMatrix<Scalar,0,StorageIndex> SparseMatrixType;
   Scalar eps = 1e-6;
@@ -143,6 +144,33 @@
     }
   }
 
+  // test sort
+  if(rows > 1)
+  {
+    SparseVectorType vec1(rows);
+    DenseVector refVec1 = DenseVector::Zero(rows);
+    DenseIndexVector innerIndices(rows);
+    innerIndices.setLinSpaced(0, rows - 1);
+    std::random_shuffle(innerIndices.begin(), innerIndices.end());
+    Index nz = internal::random<Index>(2, rows / 2);
+    for (Index k = 0; k < nz; k++)
+    {
+        Index i = innerIndices[k];
+        Scalar val = internal::random<Scalar>();
+        refVec1.coeffRef(i) = val;
+        vec1.insert(i) = val;
+    }
+
+    vec1.template sortInnerIndices<std::greater<>>();
+    VERIFY_IS_APPROX(vec1, refVec1);
+    VERIFY_IS_EQUAL(vec1.template innerIndicesAreSorted<std::greater<>>(), 1);
+    VERIFY_IS_EQUAL(vec1.template innerIndicesAreSorted<std::less<>>(), 0);
+    vec1.template sortInnerIndices<std::less<>>();
+    VERIFY_IS_APPROX(vec1, refVec1);
+    VERIFY_IS_EQUAL(vec1.template innerIndicesAreSorted<std::greater<>>(), 0);
+    VERIFY_IS_EQUAL(vec1.template innerIndicesAreSorted<std::less<>>(), 1);
+  }
+
 }
 void test_pruning() {
     using SparseVectorType = SparseVector<double, 0, int>;
diff --git a/test/visitor.cpp b/test/visitor.cpp
index 3b118f9..bc34917 100644
--- a/test/visitor.cpp
+++ b/test/visitor.cpp
@@ -207,8 +207,9 @@
   
   // Unrolled - ColMajor.
   {
-    Eigen::Matrix4f X = Eigen::Matrix4f::Random();
-    TrackedVisitor<double, false> visitor;
+    using MatrixType = Matrix<float, 4, 4, ColMajor>;
+    MatrixType X = MatrixType::Random(4, 4);
+    TrackedVisitor<MatrixType::Scalar, false> visitor;
     X.visit(visitor);
     Index count = 0;
     for (Index j=0; j<X.cols(); ++j) {
@@ -221,10 +222,10 @@
   }
   
   // Unrolled - RowMajor.
-  using Matrix4fRowMajor = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
   {
-    Matrix4fRowMajor X = Matrix4fRowMajor::Random();
-    TrackedVisitor<double, false> visitor;
+    using MatrixType = Matrix<float, 4, 4, RowMajor>;
+    MatrixType X = MatrixType::Random(4, 4);
+    TrackedVisitor<MatrixType::Scalar, false> visitor;
     X.visit(visitor);
     Index count = 0;
     for (Index i=0; i<X.rows(); ++i) {
@@ -238,8 +239,9 @@
   
   // Not unrolled - ColMajor
   {
-    Eigen::MatrixXf X = Eigen::MatrixXf::Random(4, 4);
-    TrackedVisitor<double, false> visitor;
+    using MatrixType = Matrix<float, Dynamic, Dynamic, ColMajor>;
+    MatrixType X = MatrixType::Random(4, 4);
+    TrackedVisitor<MatrixType::Scalar, false> visitor;
     X.visit(visitor);
     Index count = 0;
     for (Index j=0; j<X.cols(); ++j) {
@@ -252,10 +254,10 @@
   }
   
   // Not unrolled - RowMajor.
-  using MatrixXfRowMajor = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
   {
-    MatrixXfRowMajor X = MatrixXfRowMajor::Random(4, 4);
-    TrackedVisitor<double, false> visitor;
+    using MatrixType = Matrix<float, Dynamic, Dynamic, RowMajor>;
+    MatrixType X = MatrixType::Random(4, 4);
+    TrackedVisitor<MatrixType::Scalar, false> visitor;
     X.visit(visitor);
     Index count = 0;
     for (Index i=0; i<X.rows(); ++i) {
@@ -269,10 +271,11 @@
   
   // Vectorized - ColMajor
   {
+    using MatrixType = Matrix<float, Dynamic, Dynamic, ColMajor>;
     // Ensure rows/cols is larger than packet size.
-    constexpr int PacketSize = Eigen::internal::packet_traits<float>::size;
-    Eigen::MatrixXf X = Eigen::MatrixXf::Random(4 * PacketSize, 4 * PacketSize);
-    TrackedVisitor<double, true> visitor;
+    constexpr int PacketSize = Eigen::internal::packet_traits<MatrixType::Scalar>::size;
+    MatrixType X = MatrixType::Random(4 * PacketSize, 4 * PacketSize);
+    TrackedVisitor<MatrixType::Scalar, true> visitor;
     X.visit(visitor);
     Index previ = -1;
     Index prevj = 0;
@@ -287,17 +290,18 @@
       previ = i;
       prevj = j;
     }
-    if (Eigen::internal::packet_traits<float>::Vectorizable) {
+    if (Eigen::internal::packet_traits<MatrixType::Scalar>::Vectorizable) {
       VERIFY(visitor.vectorized);
     }
   }
   
   // Vectorized - RowMajor.
   {
+    using MatrixType = Matrix<float, Dynamic, Dynamic, RowMajor>;
     // Ensure rows/cols is larger than packet size.
-    constexpr int PacketSize = Eigen::internal::packet_traits<float>::size;
-    MatrixXfRowMajor X = MatrixXfRowMajor::Random(4 * PacketSize, 4 * PacketSize);
-    TrackedVisitor<double, true> visitor;
+    constexpr int PacketSize = Eigen::internal::packet_traits<MatrixType::Scalar>::size;
+    MatrixType X = MatrixType::Random(4 * PacketSize, 4 * PacketSize);
+    TrackedVisitor<MatrixType::Scalar, true> visitor;
     X.visit(visitor);
     Index previ = 0;
     Index prevj = -1;
@@ -312,7 +316,7 @@
       previ = i;
       prevj = j;
     }
-    if (Eigen::internal::packet_traits<float>::Vectorizable) {
+    if (Eigen::internal::packet_traits<MatrixType::Scalar>::Vectorizable) {
       VERIFY(visitor.vectorized);
     }
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index d9a3d32..7864a2a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -95,6 +95,10 @@
     return firstLevelCacheSize();
 #endif
   }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+    // Nothing.  Default device operations are synchronous.
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
 #if !defined(EIGEN_GPU_COMPILE_PHASE)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index a3adf61..6accc66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -147,6 +147,10 @@
     // The l3 cache size is shared between all the cores.
     return l3CacheSize() / num_threads_;
   }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+    // Nothing.  Threadpool device operations are synchronous.
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     // Should return an enum that encodes the ISA supported by the CPU
diff --git a/unsupported/Eigen/src/IterativeSolvers/IDRS.h b/unsupported/Eigen/src/IterativeSolvers/IDRS.h
index 2c7d7b0..d8f7a32 100644
--- a/unsupported/Eigen/src/IterativeSolvers/IDRS.h
+++ b/unsupported/Eigen/src/IterativeSolvers/IDRS.h
@@ -66,7 +66,6 @@
   const RealScalar tol = relres;
   const Index maxit = iter;
 
-  Index replacements = 0;
   bool trueres = false;
 
   FullPivLU<DenseMatrixType> lu_solver;
@@ -88,15 +87,14 @@
   }
   // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf
   // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon).
-  // With epsilon the
-  // relative machine precision. The factor tol/epsilon corresponds to the size of a
-  // finite precision number that is so large that the absolute round-off error in
-  // this number, when propagated through the process, makes it impossible to
-  // achieve the required accuracy.The factor C accounts for the accumulation of
-  // round-off errors. This parameter has beenset to 10−3.
-  // mp is epsilon/C
-  // 10^3 * eps is very conservative, so normally no residual replacements will take place.
-  // It only happens if things go very wrong. Too many restarts may ruin the convergence.
+  // With epsilon the relative machine precision. The factor tol/epsilon corresponds
+  // to the size of a finite precision number that is so large that the absolute
+  // round-off error in this number, when propagated through the process, makes it
+  // impossible to achieve the required accuracy. The factor C accounts for the
+  // accumulation of round-off errors. This parameter has been set to 10^{-3}.
+  // mp is epsilon/C. 10^3 * eps is very conservative, so normally no residual
+  // replacements will take place. It only happens if things go very wrong. Too many
+  // restarts may ruin the convergence.
   const RealScalar mp = RealScalar(1e3) * NumTraits<Scalar>::epsilon();
 
   // Compute initial residual
@@ -224,7 +222,6 @@
     if (trueres && normr < normb) {
       r = b - A * x;
       trueres = false;
-      replacements++;
     }
 
     // Smoothing: