Update Eigen to commit: 3460f3558e7b469efb8a225894e21929c8c77629

CHANGELOG
=========
3460f3558 - Use VERIFY_IS_EQUAL to compare to zeros.
13a1f25da - Revert StlIterators edit from "Fix undefined behavior..."
fd2fd4870 - Update file ForwardDeclarations.h
37b2e9717 - Tweak special case handling in atan2.
a1cdcdb03 - Fix undefined behavior in Block access
4a58f30aa - Fix pre-POWER8_VECTOR bugs in pcmp_lt and pnegate and reactivate psqrt.
12ad99ce6 - Remove unused variables from GenericPacketMathFunctions.h
6987a200b - Fix stupid sparse bugs with outerSize == 0
0471e61b4 - Optimize various mathematical packet ops
1aa6dc200 - Fix sparse warnings
17ae83a96 - Fix bugs exposed by enabling GPU asserts.
ab8725d94 - Turn off vectorize version of rsqrt - doesn't match generic version
6d9f662a7 - Tweak atan2
6fc9de7d9 - Fix slowdown in bfloat16 MMA when rows is not a multiple of 8 or columns is not a multiple of 4.
6d4221af7 - Revert qr tests
7f58bc98b - Refactor sparse
576448572 - More fixes for __GNUC_PATCHLEVEL__.
164ddf75a - Use __GNUC_PATCHLEVEL__ rather than __GNUC_PATCH__, according to the documentation https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html
5a7ca681d - Fix sparse insert
08c961e83 - Add custom ODR-safe assert.
3fe8c5110 - Replace the Deprecated `$<CONFIGURATION>` with `$<CONFIG>`
d70b4864d - issue #2581: review and cleanup of compiler version checks
b52312068 - [SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen
bae119bb7 - Support per-thread is_malloc_allowed() state
fa0bd2c34 - improve sparse permutations
2e61c0c6b - Add missing EIGEN_DEVICE_FUNC in a few places when called by asserts.
4aca06f63 - avoid move assignment in ColPivHouseholderQR
68082b822 - Fix QR, again
4d0576534 - Altivec fixes for Darwin: do not use unsupported VSX insns
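
Two of the entries above (37b2e9717 "Tweak special case handling in atan2", 6d9f662a7 "Tweak atan2")
and the new generic patan2 in GenericPacketMath.h below target the IEC 60559 corner cases cited in
its comments. As a standalone reminder of what those rules require - plain std::atan2 here, not
Eigen internals; the full table is at https://en.cppreference.com/w/cpp/numeric/math/atan2 :

    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main() {
      const double inf = std::numeric_limits<double>::infinity();
      // Both arguments zero: the sign of x selects between 0 and pi,
      // the sign of y selects the sign of the result.
      std::printf("atan2(+0, +0)     = % g\n", std::atan2(+0.0, +0.0));    // +0
      std::printf("atan2(-0, -0)     = % g\n", std::atan2(-0.0, -0.0));    // -pi
      // Finite y with infinite x collapses to 0 or pi.
      std::printf("atan2( 1, +inf)   = % g\n", std::atan2(1.0, +inf));     // +0
      std::printf("atan2( 1, -inf)   = % g\n", std::atan2(1.0, -inf));     // +pi
      // Infinite y with finite x gives +/- pi/2.
      std::printf("atan2(+inf, 7)    = % g\n", std::atan2(+inf, 7.0));     // +pi/2
      // Both infinite: +/- pi/4 or +/- 3*pi/4, depending on the sign of x.
      std::printf("atan2(+inf, -inf) = % g\n", std::atan2(+inf, -inf));    // +3*pi/4
      return 0;
    }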
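
One API-visible change in this update (see the MatrixBase.h hunk below): colPivHouseholderQr(),
fullPivHouseholderQr() and completeOrthogonalDecomposition() become member templates with a
defaulted PermutationIndex, so existing call sites compile unchanged while the permutation index
type can now be chosen explicitly. A minimal usage sketch under that assumption, not part of this
patch:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 4);
      // Unchanged call site: picks up DefaultPermutationIndex.
      auto qr_default = A.colPivHouseholderQr();
      // New: request a specific permutation index type explicitly.
      auto qr_int = A.colPivHouseholderQr<int>();
      // The decomposition is used as before.
      Eigen::VectorXd x = qr_int.solve(Eigen::VectorXd::Ones(6));
      (void)qr_default;
      (void)x;
      return 0;
    }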

PiperOrigin-RevId: 506525228
Change-Id: I7b499f4573134b7fbdc73a4feb15a3ea032b599d
diff --git a/Eigen/Core b/Eigen/Core
index 623d735..af71891 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -34,16 +34,16 @@
   #include <new>
 #endif
 
-// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
+// Disable the ipa-cp-clone optimization flag with MinGW 6.x or older (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_MOST(5,5)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_STRICT_LESS_THAN(6,0,0)
   #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif
 
 // Prevent ICC from specializing std::complex operators that silently fail
 // on device. This allows us to use our own device-compatible specializations
 // instead.
-#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \
+#if EIGEN_COMP_ICC && defined(EIGEN_GPU_COMPILE_PHASE) \
     && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
 #define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
 #endif
@@ -82,7 +82,6 @@
 #include <cstddef>
 #include <cstdlib>
 #include <cmath>
-#include <cassert>
 #include <functional>
 #ifndef EIGEN_NO_IO
   #include <sstream>
@@ -158,6 +157,7 @@
 
 #include "src/Core/util/Constants.h"
 #include "src/Core/util/Meta.h"
+#include "src/Core/util/Assert.h"
 #include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
diff --git a/Eigen/src/Cholesky/LLT_LAPACKE.h b/Eigen/src/Cholesky/LLT_LAPACKE.h
index 62bc679..a32ff23 100644
--- a/Eigen/src/Cholesky/LLT_LAPACKE.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@@ -70,6 +70,7 @@
 
   template<typename Scalar, UpLoType Mode>
   struct lapacke_llt {
+    EIGEN_STATIC_ASSERT(((Mode == Lower) || (Mode == Upper)),MODE_MUST_BE_UPPER_OR_LOWER)
     template<typename MatrixType>
     static Index blocked(MatrixType& m)
     {
@@ -80,10 +81,11 @@
       /* Set up parameters for ?potrf */
       lapack_int size = to_lapack(m.rows());
       lapack_int matrix_order = lapack_storage_of(m);
+      constexpr char uplo = Mode == Upper ? 'U' : 'L';
       Scalar* a = &(m.coeffRef(0,0));
       lapack_int lda = to_lapack(m.outerStride());
 
-      lapack_int info = potrf(matrix_order, translate_mode<Mode>, size, to_lapack(a), lda );
+      lapack_int info = potrf(matrix_order, uplo, size, to_lapack(a), lda );
       info = (info==0) ? -1 : info>0 ? info-1 : size;
       return info;
     }
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 8fb1f81..2b3a1c7 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -937,7 +937,7 @@
 }
 
 // forward declaration
-template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
+template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void check_for_aliasing(const Dst &dst, const Src &src);
 
 // Generic Dense to Dense assignment
 // Note that the last template argument "Weak" is needed to make it possible to perform
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 19c4b68..5805763 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -336,6 +336,17 @@
     enum {
       XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
     };
+
+    /** \internal Returns base+offset (unless base is null, in which case returns null).
+      * Adding an offset to nullptr is undefined behavior, so we must avoid it.
+      */
+    template <typename Scalar>
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE
+    static Scalar* add_to_nullable_pointer(Scalar* base, Index offset)
+    {
+      return base != nullptr ? base+offset : nullptr;
+    }
+
   public:
 
     typedef MapBase<BlockType> Base;
@@ -346,8 +357,9 @@
       */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     BlockImpl_dense(XprType& xpr, Index i)
-      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
-                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
+      : Base(add_to_nullable_pointer(xpr.data(),
+                 i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
+                       || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
         m_xpr(xpr),
@@ -361,7 +373,8 @@
       */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+      : Base(add_to_nullable_pointer(xpr.data(),
+                 xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))),
         m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
@@ -373,7 +386,9 @@
     BlockImpl_dense(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+      : Base(add_to_nullable_pointer(xpr.data(),
+                 xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+             blockRows, blockCols),
         m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index cf588bd..b409a39 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -54,18 +54,6 @@
 
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
-#elif EIGEN_COMP_GNUC
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
-  // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
-  // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
-  template<typename PtrType>
-  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
-  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((internal::is_constant_evaluated() \
-                || (internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0) \
-              && "this assertion is explained here: " \
-              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
-              " **** READ THIS WEB PAGE !!! ****");
 #else
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
     eigen_assert((internal::is_constant_evaluated() || (internal::UIntPtr(array) & (sizemask)) == 0) \
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index af773dd..d65c63a 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -1219,6 +1219,43 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr Packet
 psignbit(const Packet& a) { return psignbit_impl<Packet>::run(a); }
 
+/** \internal \returns the 2-argument arc tangent of \a y and \a x (coeff-wise) */
+template <typename Packet, std::enable_if_t<is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet patan2(const Packet& y, const Packet& x) {
+  return numext::atan2(y, x);
+}
+
+/** \internal \returns the 2-argument arc tangent of \a y and \a x (coeff-wise) */
+template <typename Packet, std::enable_if_t<!is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet patan2(const Packet& y, const Packet& x) {
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+
+  // See https://en.cppreference.com/w/cpp/numeric/math/atan2
+  // for how corner cases are supposed to be handled according to the
+  // IEEE floating-point standard (IEC 60559).
+  const Packet kSignMask = pset1<Packet>(-Scalar(0));
+  const Packet kZero = pzero(x);
+  const Packet kOne = pset1<Packet>(Scalar(1));
+  const Packet kPi = pset1<Packet>(Scalar(EIGEN_PI));
+
+  const Packet x_has_signbit = psignbit(x);
+  const Packet y_signmask = pand(y, kSignMask);
+  const Packet x_signmask = pand(x, kSignMask);
+  const Packet result_signmask = pxor(y_signmask, x_signmask);
+  const Packet shift = por(pand(x_has_signbit, kPi), y_signmask);
+
+  const Packet x_and_y_are_same = pcmp_eq(pabs(x), pabs(y));
+  const Packet x_and_y_are_zero = pcmp_eq(por(x, y), kZero);
+
+  Packet arg = pdiv(y, x);
+  arg = pselect(x_and_y_are_same, por(kOne, result_signmask), arg);
+  arg = pselect(x_and_y_are_zero, result_signmask, arg);
+
+  Packet result = patan(arg);
+  result = padd(result, shift);
+  return result;
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index b194353..def5428 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1734,6 +1734,12 @@
   return static_cast<T>(atan(x));
 }
 
+template <typename T, std::enable_if_t<!NumTraits<T>::IsComplex, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan2(const T& y, const T& x) {
+  EIGEN_USING_STD(atan2);
+  return static_cast<T>(atan2(y, x));
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atanh(const T &x) {
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index ea2178f..8767e1f 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -362,9 +362,9 @@
 /////////// QR module ///////////
 
     inline const HouseholderQR<PlainObject> householderQr() const;
-    inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
-    inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
-    inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;
+    template<typename PermutationIndex = DefaultPermutationIndex> inline const ColPivHouseholderQR<PlainObject, PermutationIndex> colPivHouseholderQr() const;
+    template<typename PermutationIndex = DefaultPermutationIndex> inline const FullPivHouseholderQR<PlainObject, PermutationIndex> fullPivHouseholderQr() const;
+    template<typename PermutationIndex = DefaultPermutationIndex> inline const CompleteOrthogonalDecomposition<PlainObject, PermutationIndex> completeOrthogonalDecomposition() const;
 
 /////////// Eigenvalues module ///////////
 
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 74650ef..c56318c 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -402,7 +402,7 @@
 template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
 struct check_transpose_aliasing_run_time_selector
 {
-  static bool run(const Scalar* dest, const OtherDerived& src)
+  EIGEN_DEVICE_FUNC static bool run(const Scalar* dest, const OtherDerived& src)
   {
     return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
   }
@@ -411,7 +411,7 @@
 template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
 struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
 {
-  static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
+  EIGEN_DEVICE_FUNC static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
   {
     return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
         || ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
@@ -431,7 +431,7 @@
         >
 struct checkTransposeAliasing_impl
 {
-    static void run(const Derived& dst, const OtherDerived& other)
+    EIGEN_DEVICE_FUNC static void run(const Derived& dst, const OtherDerived& other)
     {
         eigen_assert((!check_transpose_aliasing_run_time_selector
                       <typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
@@ -445,13 +445,13 @@
 template<typename Derived, typename OtherDerived>
 struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
 {
-    static void run(const Derived&, const OtherDerived&)
+    EIGEN_DEVICE_FUNC static void run(const Derived&, const OtherDerived&)
     {
     }
 };
 
 template<typename Dst, typename Src>
-void check_for_aliasing(const Dst &dst, const Src &src)
+EIGEN_DEVICE_FUNC inline void check_for_aliasing(const Dst &dst, const Src &src)
 {
   if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
     internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index b004f76..762cbfc 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -745,7 +745,7 @@
 #   endif
 
   protected:
-    Index redux_length() const
+    EIGEN_DEVICE_FUNC Index redux_length() const
     {
       return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
     }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 0fe830a..60e1ff4 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -670,7 +670,7 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // There appears to be a bug in GCC, by which the optimizer may flip
   // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -684,7 +684,7 @@
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // See pmin above
   Packet4d res;
   asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
@@ -705,7 +705,7 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // See pmin above
   Packet8f res;
   asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
@@ -716,7 +716,7 @@
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // See pmin above
   Packet4d res;
   asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 159ae3e..916babf 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -31,7 +31,7 @@
 #endif
 
 // Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
+#if EIGEN_GNUC_STRICT_AT_LEAST(5,3,0) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
 #define EIGEN_HAS_AVX512_MATH 1
 #else
 #define EIGEN_HAS_AVX512_MATH 0
@@ -2384,7 +2384,7 @@
 EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
   Packet16bf r;
 
-#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10,1,0)
   // Since GCC 10.1 supports avx512bf16 and C style explicit cast
   // (C++ static_cast is not supported yet), do conversion via intrinsic
   // and register path for performance.
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index f2e3e84..bbc016a 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -17,7 +17,7 @@
 namespace internal {
 
 // Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
+#if EIGEN_GNUC_STRICT_AT_LEAST(5,3,0) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
 #define EIGEN_HAS_AVX512_MATH 1
 #else
 #define EIGEN_HAS_AVX512_MATH 0
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 46812f9..60f28ff 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -18,7 +18,7 @@
 namespace internal {
 
 static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 #if defined(_BIG_ENDIAN)
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -103,7 +103,7 @@
     HasMin    = 0,
     HasMax    = 0,
     HasSqrt   = 1,
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     HasBlend  = 1,
 #endif
     HasSetLinear = 0
@@ -115,7 +115,7 @@
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
   // Load a single std::complex<float> from memory and duplicate
   //
   // Using pload would read past the end of the reference in this case
@@ -151,7 +151,7 @@
 EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
   Packet4f res0, res1;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
   // Load two std::complex<float> from memory and combine
   __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
   __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
@@ -269,7 +269,7 @@
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
 {
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
   Packet4f tmp = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
   kernel.packet[1].v = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
 #else
@@ -284,7 +284,7 @@
   return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
 }
 
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   Packet2cf result;
   result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
@@ -298,7 +298,7 @@
 }
 
 //---------- double ----------
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 struct Packet1cd
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 6f48d98..45761e2 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -60,27 +60,7 @@
   return patan_float(_x);
 }
 
-#ifdef __VSX__
-#ifndef EIGEN_COMP_CLANG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f prsqrt<Packet4f>(const Packet4f& x)
-{
-  return  vec_rsqrt(x);
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d prsqrt<Packet2d>(const Packet2d& x)
-{
-  return  vec_rsqrt(x);
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d patan<Packet2d>(const Packet2d& _x)
-{
-  return patan_double(_x);
-}
-#endif
-
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
@@ -93,6 +73,30 @@
   return  vec_sqrt(x);
 }
 
+#if !EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
+//  vec_rsqrt returns different results from the generic version
+//  return  vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+//  vec_rsqrt returns different results from the generic version
+//  return  vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d patan<Packet2d>(const Packet2d& _x)
+{
+  return patan_double(_x);
+}
+#endif
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
@@ -103,7 +107,7 @@
   BF16_TO_F32_UNARY_OP_WRAPPER(psqrt<Packet4f>, a);
 }
 
-#ifndef EIGEN_COMP_CLANG
+#if !EIGEN_COMP_CLANG
 template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
   BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
 }
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 61d9618..adb2eac 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -1007,6 +1007,7 @@
 
           Packet2ul v1[8];
 
+          // This is transposing and interleaving data
           v1[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
           v1[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
           v1[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
@@ -1052,19 +1053,82 @@
 
       if(PanelMode) ri += vectorSize*(stride - offset - depth);
     }
-
-    if(PanelMode) ri += offset;
-
-    for(; j < rows; j++)
+    if(j + 4 <= rows)
     {
       const DataMapper lhs2 = lhs.getSubMapper(j, 0);
-      for(Index i = 0; i < depth; i++)
+      Index i = 0;
+
+      if(PanelMode) ri += 4*offset;
+
+      for(; i + 2 <= depth; i+=2)
       {
-        blockA[ri] = lhs2(0, i);
-        ri += 1;
+        if(StorageOrder == ColMajor)
+        {
+          PacketBlock<Packet8bf,2> block;
+
+          block.packet[0] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
+          block.packet[1] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 1, 4);
+
+          block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+
+          pstore<bfloat16>(blockA + ri, block.packet[0]);
+        } else {
+          blockA[ri+0] = lhs2(0, i + 0);
+          blockA[ri+1] = lhs2(0, i + 1);
+          blockA[ri+2] = lhs2(1, i + 0);
+          blockA[ri+3] = lhs2(1, i + 1);
+          blockA[ri+4] = lhs2(2, i + 0);
+          blockA[ri+5] = lhs2(2, i + 1);
+          blockA[ri+6] = lhs2(3, i + 0);
+          blockA[ri+7] = lhs2(3, i + 1);
+        }
+
+        ri += 2*4;
+      }
+      if (depth & 1)
+      {
+        if(StorageOrder == ColMajor)
+        {
+          Packet8bf lhsV = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
+
+          pstore_partial<bfloat16>(blockA + ri, lhsV, 4);
+        } else {
+          blockA[ri+0] = lhs2(0, i);
+          blockA[ri+1] = lhs2(1, i);
+          blockA[ri+2] = lhs2(2, i);
+          blockA[ri+3] = lhs2(3, i);
+        }
+
+        ri += 4;
       }
 
-      if(PanelMode) ri += stride - depth;
+      if(PanelMode) ri += 4*(stride - offset - depth);
+      j += 4;
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) ri += offset*(rows - j);
+
+      Index i = 0;
+      for(; i + 2 <= depth; i+=2)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockA[ri+0] = lhs(k, i + 0);
+          blockA[ri+1] = lhs(k, i + 1);
+          ri += 2;
+        }
+      }
+      if (depth & 1)
+      {
+        for(; j < rows; j++)
+        {
+          blockA[ri] = lhs(j, i);
+          ri += 1;
+        }
+      }
     }
   }
 };
@@ -1163,18 +1227,29 @@
       if(PanelMode) ri += 4*(stride - offset - depth);
     }
 
-    if(PanelMode) ri += offset;
-
-    for(; j < cols; j++)
+    if (j < cols)
     {
-      const DataMapper rhs2 = rhs.getSubMapper(0, j);
-      for(Index i = 0; i < depth; i++)
-      {
-        blockB[ri] = rhs2(i, 0);
-        ri += 1;
-      }
+      if(PanelMode) ri += offset*(cols - j);
 
-      if(PanelMode) ri += stride - depth;
+      Index i = 0;
+      for(; i + 2 <= depth; i+=2)
+      {
+        Index k = j;
+        for(; k < cols; k++)
+        {
+          blockB[ri+0] = rhs(i + 0, k);
+          blockB[ri+1] = rhs(i + 1, k);
+          ri += 2;
+        }
+      }
+      if (depth & 1)
+      {
+        for(; j < cols; j++)
+        {
+          blockB[ri] = rhs(i, j);
+          ri += 1;
+        }
+      }
     }
   }
 };
@@ -2662,6 +2737,7 @@
 #endif
 
 #ifdef __MMA__
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
 struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
 {
@@ -2689,6 +2765,7 @@
   dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
+#endif
 
 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
 struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index 01465d3..c6d4216 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -18,8 +18,8 @@
   pstoreu(result, result_block);
 }
 
-template<Index num_packets, bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
+template<bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA)
 {
   Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
   if(zero){
@@ -31,48 +31,33 @@
 }
 
 template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index strideA, Index extra_rows)
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index extra_rows)
 {
-  Index row_count = 0;
   if (zero) {
-    EIGEN_ALIGN16 bfloat16 lhs_array[8] = { Eigen::bfloat16(0) };
-    do{
-      lhs_array[row_count] = *indexA;
-      indexA += strideA;
-    } while ((row_count += 2) < extra_rows*2);
-    return pload_partial<Packet8bf>(lhs_array, extra_rows*2);
+    Packet8bf lhs1 = ploadu_partial<Packet8bf>(indexA, extra_rows);
+    Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+    return vec_mergeh(lhs1.m_val, lhs2.m_val);
   } else {
-    EIGEN_ALIGN16 int lhs_array[4];
-    do{
-      lhs_array[row_count] = *reinterpret_cast<const int *>(indexA);
-      indexA += strideA;
-    } while ((row_count += 1) < extra_rows);
-    return reinterpret_cast<Packet8us>(pload_partial<Packet4i>(lhs_array, extra_rows));
+    return reinterpret_cast<Packet8us>(ploadu_partial<Packet4i>(reinterpret_cast<const int *>(indexA), extra_rows));
   }
 }
 
 template<bool zero>
 EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, Index extra_rows)
 {
-  return loadBfloat16Extra<zero>(indexA + row*strideA, strideA, extra_rows);
+  return loadBfloat16Extra<zero>(indexA + row*strideA, extra_rows);
 }
 
 template<bool zero>
 EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, Index i, Index k)
 {
-  const bfloat16* indexB = baseB + strideB*4*i + (k*4);
-  Packet8bf rhs1 = ploadu<Packet8bf>(indexB);
-  if(zero){
-    Packet8bf rhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
-    return vec_mergeh(rhs1.m_val, rhs2.m_val);
-  }
-  return rhs1;
+  return loadBfloat16<zero>(baseB + strideB*4*i + (k*4));
 }
 
 template<bool zero>
 EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, Index i, Index k, Index extra_cols)
 {
-  return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k+offsetB, strideB, extra_cols);
+  return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k*extra_cols+offsetB, extra_cols);
 }
 
 template<Index num_acc, Index num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
@@ -93,8 +78,8 @@
 {
   Packet8bf lhs;
   Packet8bf rhs[num_acc];
-  if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k, strideA, row, extra_rows);
-  else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
+  if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k*extra_rows, strideA, row, extra_rows);
+  else lhs = loadBfloat16<zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
   BFLOAT16_UNROLL
   for(Index i = 0; i < num_acc; i++){
     if(!rhs_extra_cols)
@@ -125,7 +110,7 @@
       KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
     }
     if(depth&1){
-      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-offset_row, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
+      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-(offset_row&(num_packets-1)), indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
     }
 
     BFLOAT16_UNROLL
@@ -174,8 +159,10 @@
   typedef typename DataMapper::LinearMapper LinearMapper;
   for(Index j = 0; j < cols; j++){
     const LinearMapper res2 = res.getLinearMapper(0, j);
+    float *result2 = result + j*rows;
+    BFLOAT16_UNROLL
     for(Index i = 0; i < rows; i++){
-      result[j*rows + i] = res2(i);
+      result2[i] = res2(i);
     }
   }
 
@@ -185,15 +172,16 @@
   if( strideA == -1 ) strideA = depth;
   if( strideB == -1 ) strideB = depth;
   //Packing is done in blocks.
-  //There's 3 possible sizes of blocks
-  //Blocks of 8 columns with 16 elements (8x16) as col major
-  //Blocks of 8 columns with 8 elements (8x8) as col major. This happens when there's 16 > rows > 8 
-  //Blocks of 8 columns with <8 elements as row major. This happens when there's less than 8 remaining rows
+  //There's 4 possible sizes of blocks
+  //Blocks of 8 columns with 16 elements (8x16)
+  //Blocks of 8 columns with 8 elements (8x8). This happens when there's 16 > rows >= 8
+  //Blocks of 8 columns with 4 elements (8x4). This happens when there's 8 > rows >= 4
+  //Blocks of 8 columns with < 4 elements. This happens when there's less than 4 remaining rows
 
   //Loop for LHS standard block (8x16)
   const Index standard_block_size = 16;
   const Index standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
-  Index bigSuffix = (2*8) * (strideA-offsetA-depth);
+  Index bigSuffix = (2*8) * (strideA-offsetA);
   const bfloat16* indexA = blockA;
   const Index offset_factor = 2;
   Index block_index;
@@ -215,11 +203,11 @@
       }
     }
     row += 16;
-    indexA += bigSuffix + 2*8*depth;
+    indexA += bigSuffix;
   }
   //LHS (8x8) block
-  if(rows - standard_blocks_quantity*16 >= 8){
-    indexA += 1*8*offsetA + 2*8*offsetA;
+  if(rows & 8){
+    indexA += 1*8*offsetA;
     for(Index offset_row = 0; offset_row < 8; offset_row += 4){
       col = 0;
       colLoopBody<7, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
@@ -238,14 +226,33 @@
       }
     } //end extra cols
     row += 8;
+    indexA += (bigSuffix >> 1);
+  }
+  //LHS (8x4) block
+  if(rows & 4){
+    Index offset_row = (rows & 8);
+    indexA += 1*4*offsetA;
+    col = 0;
+    colLoopBody<7, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    colLoopBody<6, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    colLoopBody<5, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    colLoopBody<4, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    colLoopBody<3, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    colLoopBody<2, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    colLoopBody<1, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+    if(cols > col){
+      Index extra_cols= cols-col;
+
+      colLoopBody<1, 4, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
+    }
+    row += 4;
+    indexA += (bigSuffix >> 2);
   }
   //extra rows
-  while(row < rows){
-    Index extra_rows = rows-row;
-    Index extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
+  if(row < rows){
+    Index extra_rows_or_four = rows-row;
 
-    //This index is the beginning of remaining block. 
-    //This last block for LHS is organized as RowMajor
+    //This index is the beginning of remaining block.
     col = 0;
     colLoopBody<7, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
     colLoopBody<6, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
@@ -269,8 +276,8 @@
       //get and save block
       PacketBlock<Packet8bf,4> block;
       for(Index j = 0; j < 4; j++){
-        Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row)));
-        Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row + 4)));
+        Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(ploadu<Packet4f>(result + (col + j)*rows + row)));
+        Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(ploadu<Packet4f>(result + (col + j)*rows + row + 4)));
         block.packet[j].m_val = vec_pack(reinterpret_cast<Packet4ui>(fp16_0), reinterpret_cast<Packet4ui>(fp16_1));
       }
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 9d00b93..bb84ac9 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -599,7 +599,7 @@
 
 EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a)
 {
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
 #else
     return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
@@ -610,7 +610,7 @@
 EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src)
 {
     Packet4f t;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     // Load float64/two float32 (doubleword alignment)
     __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src));
 #else
@@ -636,7 +636,7 @@
 template<typename RhsScalar>
 EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i)
 {
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
     __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<double*>(src) + 1)));
 #else
@@ -675,7 +675,7 @@
 /** \internal load and splat a complex value into a vector - column-wise */
 EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src)
 {
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     Packet4f ret;
     __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
     return ret;
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index b0f8529..fa12892 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -173,7 +173,7 @@
     HasATan = 1,
     HasLog = 1,
     HasExp = 1,
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     HasSqrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
@@ -218,7 +218,7 @@
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,
     HasExp = 1,
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     HasSqrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
@@ -446,7 +446,7 @@
   // ignoring these warnings for now.
   EIGEN_UNUSED_VARIABLE(from);
   EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
   return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
 #else
   return vec_ld(0, from);
@@ -501,7 +501,7 @@
   #pragma GCC diagnostic push
   #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
   return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
 #else
   return vec_ld(0, from);
@@ -608,7 +608,7 @@
   // ignoring these warnings for now.
   EIGEN_UNUSED_VARIABLE(to);
   EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
   vec_xst(from, 0, to);
 #else
   vec_st(from, 0, to);
@@ -1002,7 +1002,7 @@
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
-  return p4f_ZERO - a;
+  return vec_xor(a, p4f_MZERO);
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
@@ -1054,7 +1054,7 @@
 template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
 
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_msub(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmsub(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmadd(a,b,c); }
@@ -1062,7 +1062,7 @@
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-  #ifdef __VSX__
+  #ifdef EIGEN_VECTORIZE_VSX
   // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
@@ -1080,7 +1080,7 @@
 
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-  #ifdef __VSX__
+  #ifdef EIGEN_VECTORIZE_VSX
   // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
@@ -1096,34 +1096,37 @@
 template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+// To fix bug with vec_cmplt on older versions
+#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+#endif
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
   Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
   return vec_nor(c,c);
 }
 
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
 #endif
 template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
 #endif
 template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
 template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
 #endif
 template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
 template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
 #endif
 template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
 template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
 #endif
 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
@@ -1164,7 +1167,7 @@
     Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
     Packet4f res;
 
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
     __asm__("xvrspiz %x0, %x1\n\t"
         : "=&wa" (res)
         : "wa" (t));
@@ -1178,7 +1181,7 @@
 }
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
 {
     Packet4f res;
@@ -1194,7 +1197,7 @@
 template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
-#if defined(__VSX__) || !defined(_BIG_ENDIAN)
+#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
   EIGEN_DEBUG_UNALIGNED_LOAD
   return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
 #else
@@ -1377,7 +1380,7 @@
 template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
 {
   EIGEN_DEBUG_UNALIGNED_STORE
-#if defined(__VSX__) || !defined(_BIG_ENDIAN)
+#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
   vec_xst(from, 0, to);
 #else
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
@@ -1773,7 +1776,7 @@
 template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
   BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
 }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
   BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
 }
@@ -2657,7 +2660,7 @@
 
 
 //---------- double ----------
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
@@ -2710,7 +2713,11 @@
     HasLog  = 0,
     HasExp  = 1,
     HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
@@ -2816,7 +2823,7 @@
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
-  return p2d_ZERO - a;
+  return vec_xor(a, p2d_MZERO);
 #endif
 }
 
@@ -2935,8 +2942,7 @@
 // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
 template<>
 inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 4) || \
-    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+#if EIGEN_GNUC_STRICT_AT_LEAST(7,1,0)
   return vec_cts(x, 0);    // TODO: check clang version.
 #else
   double tmp[2];
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 3060214..38bd93d 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -728,20 +728,19 @@
 
   const Packet cst_one = pset1<Packet>(Scalar(1));
   const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
-  const Packet p6 = pset1<Packet>(Scalar(2.26911413483321666717529296875e-3));
-  const Packet p5 = pset1<Packet>(Scalar(-1.1063250713050365447998046875e-2));
-  const Packet p4 = pset1<Packet>(Scalar(2.680264413356781005859375e-2));
-  const Packet p3 = pset1<Packet>(Scalar(-4.87488098442554473876953125e-2));
-  const Packet p2 = pset1<Packet>(Scalar(8.874166011810302734375e-2));
-  const Packet p1 = pset1<Packet>(Scalar(-0.2145837843418121337890625));
-  const Packet p0 = pset1<Packet>(Scalar(1.57079613208770751953125));
+  const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
+  const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
+  const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
+  const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
+  const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
+  const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
+  const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
 
   // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
   // function, by a 6'th order polynomial.
   // For x in [-1:0) we use that acos(-x) = pi - acos(x).
-  const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
-  Packet x = pabs(x_in);
-  const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
+  const Packet neg_mask = psignbit(x_in);
+  const Packet abs_x = pabs(x_in);
 
   // Evaluate the polynomial using Horner's rule:
   //   P(x) = p0 + x * (p1 +  x * (p2 + ... (p5 + x * p6)) ... ) .
@@ -753,19 +752,15 @@
   p_even = pmadd(p_even, x2, p2);
   p_odd = pmadd(p_odd, x2, p1);
   p_even = pmadd(p_even, x2, p0);
-  Packet p = pmadd(p_odd, x, p_even);
+  Packet p = pmadd(p_odd, abs_x, p_even);
 
   // The polynomial approximates acos(x)/sqrt(1-x), so
   // multiply by sqrt(1-x) to get acos(x).
-  Packet denom = psqrt(psub(cst_one, x));
+  // Conveniently returns NaN for arguments outside [-1:1].
+  Packet denom = psqrt(psub(cst_one, abs_x));
   Packet result = pmul(denom, p);
-
   // Undo mapping for negative arguments.
-  result = pselect(neg_mask, psub(cst_pi, result), result);
-  // Return NaN for arguments outside [-1:1].
-  return pselect(invalid_mask,
-                 pset1<Packet>(std::numeric_limits<float>::quiet_NaN()),
-                 result);
+  return pselect(neg_mask, psub(cst_pi, result), result);
 }
 
 // Generic implementation of asin(x).
@@ -834,6 +829,8 @@
   // We evaluate even and odd terms in x^2 in parallel
   // to take advantage of instruction level parallelism
   // and hardware with multiple FMA units.
+
+  // note: if x == -0, this returns +0
   const Packet x2 = pmul(x, x);
   const Packet x4 = pmul(x2, x2);
   Packet q_odd = pmadd(q14, x4, q10);
@@ -852,20 +849,25 @@
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
+  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
+
+  const Packet cst_signmask = pset1<Packet>(-0.0f);
   const Packet cst_one = pset1<Packet>(1.0f);
-  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI/2);
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
 
   //   "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
   //   "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
   //            calculated using Sollya.
-  const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
-  const Packet large_mask = pcmp_lt(cst_one, pabs(x_in));
-  const Packet large_shift = pselect(neg_mask, pset1<Packet>(-kPiOverTwo), pset1<Packet>(kPiOverTwo));
-  const Packet x = pselect(large_mask, preciprocal(x_in), x_in);
+
+  const Packet abs_x = pabs(x_in);
+  const Packet x_signmask = pand(x_in, cst_signmask);
+  const Packet large_mask = pcmp_lt(cst_one, abs_x);
+  const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
   const Packet p = patan_reduced_float(x);
-  
   // Apply transformations according to the range reduction masks.
-  return pselect(large_mask, psub(large_shift, p), p);
+  Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
+  // Return correct sign
+  return pxor(result, x_signmask);
 }
 
 // Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)]
@@ -920,16 +922,17 @@
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
 
-  const Packet cst_one = pset1<Packet>(1.0);
   constexpr double kPiOverTwo = static_cast<double>(EIGEN_PI / 2);
-  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
   constexpr double kPiOverFour = static_cast<double>(EIGEN_PI / 4);
-  const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
-  const Packet cst_large = pset1<Packet>(2.4142135623730950488016887);  // tan(3*pi/8);
-  const Packet cst_medium = pset1<Packet>(0.4142135623730950488016887);  // tan(pi/8);
+  constexpr double kTanPiOverEight = 0.4142135623730950488016887;
+  constexpr double kTan3PiOverEight = 2.4142135623730950488016887;
 
-  const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
-  Packet x = pabs(x_in);
+  const Packet cst_signmask = pset1<Packet>(-0.0);
+  const Packet cst_one = pset1<Packet>(1.0);
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+  const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
+  const Packet cst_large = pset1<Packet>(kTan3PiOverEight);
+  const Packet cst_medium = pset1<Packet>(kTanPiOverEight);
 
   // Use the same range reduction strategy (to [0:tan(pi/8)]) as the
   // Cephes library:
@@ -938,10 +941,15 @@
   //             use atan(x) = pi/4 + atan((x-1)/(x+1)).
   //   "Small": For x < tan(pi/8), approximate atan(x) directly by a polynomial
   //            calculated using Sollya.
-  const Packet large_mask = pcmp_lt(cst_large, x);
-  x = pselect(large_mask, preciprocal(x), x);
-  const Packet medium_mask = pandnot(pcmp_lt(cst_medium, x), large_mask);
-  x = pselect(medium_mask, pdiv(psub(x, cst_one), padd(x, cst_one)), x);
+
+  const Packet abs_x = pabs(x_in);
+  const Packet x_signmask = pand(x_in, cst_signmask);
+  const Packet large_mask = pcmp_lt(cst_large, abs_x);
+  const Packet medium_mask = pandnot(pcmp_lt(cst_medium, abs_x), large_mask);
+
+  Packet x = abs_x;
+  x = pselect(large_mask, preciprocal(abs_x), x);
+  x = pselect(medium_mask, pdiv(psub(abs_x, cst_one), padd(abs_x, cst_one)), x);
 
   // Compute approximation of p ~= atan(x') where x' is the argument reduced to
   // [0:tan(pi/8)].
@@ -950,7 +958,8 @@
   // Apply transformations according to the range reduction masks.
   p = pselect(large_mask, psub(cst_pi_over_two, p), p);
   p = pselect(medium_mask, padd(cst_pi_over_four, p), p);
-  return pselect(neg_mask, pnegate(p), p);
+  // Return the correct sign
+  return pxor(p, x_signmask);
 }
 
 template<typename Packet>
@@ -1751,7 +1760,7 @@
   const Packet abs_x = pabs(x);
   // Predicates for sign and magnitude of x.
   const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero);
-  const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
+  const Packet x_has_signbit = psignbit(x);
   const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
   const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
   const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
@@ -1790,19 +1799,21 @@
   const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
                                     pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
                                 pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
+  const Packet pow_is_neg_zero = pand(pandnot(y_is_int, y_is_even),
+                                      por(pand(y_is_neg, pand(abs_x_is_inf, x_is_neg)), pand(y_is_pos, x_is_neg_zero)));
   const Packet inf_val =
       pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even),
               cst_neg_inf, cst_pos_inf);
-
   // General computation of pow(x,y) for positive x or negative x and integer y.
   const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
   const Packet pow_abs = generic_pow_impl(abs_x, y);
-  return pselect(
-      y_is_one, x,
-      pselect(pow_is_one, cst_one,
-              pselect(pow_is_nan, cst_nan,
-                      pselect(pow_is_inf, inf_val,
-                              pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
+  return pselect(y_is_one, x,
+                 pselect(pow_is_one, cst_one,
+                         pselect(pow_is_nan, cst_nan,
+                                 pselect(pow_is_inf, inf_val,
+                                         pselect(pow_is_neg_zero, pnegate(cst_zero),
+                                                 pselect(pow_is_zero, cst_zero,
+                                                         pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
 }
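The new pow_is_neg_zero predicate encodes one IEC 60559 corner case; a scalar sketch of the same condition follows (the helper name is hypothetical, not Eigen API).

    // pow(x, y) is -0 exactly when y is an odd integer and either
    // x == -inf with y < 0, or x == -0 with y > 0.
    #include <cmath>
    #include <limits>

    bool pow_result_is_neg_zero(double x, double y) {
      const bool y_is_int      = (y == std::floor(y));
      const bool y_is_odd_int  = y_is_int && (std::fmod(y, 2.0) != 0.0);
      const bool x_is_neg_inf  = (x == -std::numeric_limits<double>::infinity());
      const bool x_is_neg_zero = (x == 0.0) && std::signbit(x);
      return y_is_odd_int && ((y < 0 && x_is_neg_inf) || (y > 0 && x_is_neg_zero));
    }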
 
 /* polevl (modified for Eigen)
@@ -2022,7 +2033,7 @@
   const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
   const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
 
-  const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
+  const Packet x_has_signbit = psignbit(x);
   const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
   const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
 
@@ -2071,13 +2082,11 @@
   const Scalar pos_zero = Scalar(0);
   const Scalar pos_one = Scalar(1);
   const Scalar pos_inf = NumTraits<Scalar>::infinity();
-  const Scalar neg_inf = -NumTraits<Scalar>::infinity();
   const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
   const Packet cst_pos_zero = pset1<Packet>(pos_zero);
   const Packet cst_pos_one = pset1<Packet>(pos_one);
   const Packet cst_pos_inf = pset1<Packet>(pos_inf);
-  const Packet cst_neg_inf = pset1<Packet>(neg_inf);
   const Packet cst_nan = pset1<Packet>(nan);
 
   const Packet abs_x = pabs(x);
@@ -2087,7 +2096,7 @@
   const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
   const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
 
-  const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
+  const Packet x_has_signbit = psignbit(x);
   const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
 
   if (exponent_is_nan) {
diff --git a/Eigen/src/Core/arch/GPU/Complex.h b/Eigen/src/Core/arch/GPU/Complex.h
index c2b4c38..4e4cdd1 100644
--- a/Eigen/src/Core/arch/GPU/Complex.h
+++ b/Eigen/src/Core/arch/GPU/Complex.h
@@ -39,7 +39,7 @@
 // To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
 // prior to first inclusion of <complex>.  This prevents ICC from adding
 // its own specializations, so our custom ones below can be used instead.
-#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_))
+#if !(EIGEN_COMP_ICC && defined(_USE_COMPLEX_SPECIALIZATION_))
 
 // Import Eigen's internal operator specializations.
 #define EIGEN_USING_STD_COMPLEX_OPERATORS           \
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index c892d39..ff1646d 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -96,7 +96,7 @@
   template<int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
-    #if EIGEN_COMP_GNUC_STRICT
+    #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
     // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     //    vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
     // 2. workaround the gcc register split problem on arm64-neon
@@ -166,7 +166,7 @@
   template <int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
-    #if EIGEN_COMP_GNUC_STRICT
+    #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
     // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     //    vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
     // 2. workaround the gcc register split problem on arm64-neon
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 069bd4c..6d4d1c3 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3696,11 +3696,11 @@
 
 // Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
 // Confirmed at least with __apple_build_version__ = 6000054.
-#ifdef __apple_build_version__
+#if EIGEN_COMP_CLANGAPPLE
 // Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
 // https://gist.github.com/yamaya/2924292 suggests that the first 3 digits are only updated with
 // major toolchain updates.
-#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
 #else
 #define EIGEN_APPLE_DOUBLE_NEON_BUG 0
 #endif
@@ -3928,7 +3928,7 @@
 
 // Other reduction functions:
 // mul
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+#if EIGEN_COMP_CLANGAPPLE
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index a0ff359..434d033 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -475,7 +475,7 @@
 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_min_ps, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -494,7 +494,7 @@
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_min_pd, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -525,7 +525,7 @@
 
 
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_max_ps, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -544,7 +544,7 @@
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_max_pd, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index 5bc3235..062d50e 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -21,193 +21,53 @@
 #ifndef EIGEN_PACKET_MATH_SYCL_H
 #define EIGEN_PACKET_MATH_SYCL_H
 #include <type_traits>
+
 #include "../../InternalHeaderCheck.h"
 
 namespace Eigen {
 
 namespace internal {
 #ifdef SYCL_DEVICE_ONLY
-
-#define SYCL_PLOADT_RO(address_space_target)                                 \
-  template <typename packet_type, int Alignment>                             \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro(               \
-      typename cl::sycl::multi_ptr<                                          \
-          const typename unpacket_traits<packet_type>::type,                 \
-          cl::sycl::access::address_space::address_space_target>::pointer_t  \
-          from) {                                                            \
-    typedef typename unpacket_traits<packet_type>::type scalar;              \
-    typedef cl::sycl::multi_ptr<                                             \
-        scalar, cl::sycl::access::address_space::address_space_target>       \
-        multi_ptr;                                                           \
-    auto res = packet_type(                                                  \
-        static_cast<typename unpacket_traits<packet_type>::type>(0));        \
-    res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
-    return res;                                                              \
+#define SYCL_PLOAD(packet_type, AlignedType)                          \
+  template <>                                                         \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type                   \
+      pload##AlignedType<packet_type>(                                \
+          const typename unpacket_traits<packet_type>::type* from) {  \
+    using scalar = typename unpacket_traits<packet_type>::type;       \
+    typedef cl::sycl::multi_ptr<                                      \
+        const scalar, cl::sycl::access::address_space::generic_space, \
+        cl::sycl::access::decorated::no>                              \
+        multi_ptr;                                                    \
+    packet_type res{};                                                \
+    res.load(0, multi_ptr(from));                                     \
+    return res;                                                       \
   }
 
-SYCL_PLOADT_RO(global_space)
-SYCL_PLOADT_RO(local_space)
-#undef SYCL_PLOADT_RO
-#endif
-
-template <typename packet_type, int Alignment, typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
-ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
-          cl::sycl::access::mode::read_write, T>& from) {
-  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
-}
-
-#ifdef SYCL_DEVICE_ONLY
-#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)            \
-  template <typename packet_type>                                           \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType(     \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from) {                                                           \
-    return ploadt_ro<packet_type, Alignment>(from);                         \
-  }
-
-// global space
-SYCL_PLOAD(global_space, Unaligned, u)
-SYCL_PLOAD(global_space, Aligned, )
-// local space
-SYCL_PLOAD(local_space, Unaligned, u)
-SYCL_PLOAD(local_space, Aligned, )
+SYCL_PLOAD(cl::sycl::cl_float4, u)
+SYCL_PLOAD(cl::sycl::cl_float4, )
+SYCL_PLOAD(cl::sycl::cl_double2, u)
+SYCL_PLOAD(cl::sycl::cl_double2, )
 
 #undef SYCL_PLOAD
-#endif
 
-#define SYCL_PLOAD(Alignment, AlignedType)                              \
-  template <typename packet_type>                                       \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
-      const Eigen::TensorSycl::internal::RangeAccess<                   \
-          cl::sycl::access::mode::read_write,                           \
-          typename unpacket_traits<packet_type>::type>                  \
-          from) {                                                       \
-    return ploadt_ro<packet_type, Alignment>(from);                     \
-  }
-SYCL_PLOAD(Unaligned, u)
-SYCL_PLOAD(Aligned, )
-#undef SYCL_PLOAD
-
-#ifdef SYCL_DEVICE_ONLY
-/** \internal \returns a packet version of \a *from.
- * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
-#define SYCL_PLOADT(address_space_target)                                   \
-  template <typename packet_type, int Alignment>                            \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(                 \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from) {                                                           \
-    if (Alignment >= unpacket_traits<packet_type>::alignment)               \
-      return pload<packet_type>(from);                                      \
-    else                                                                    \
-      return ploadu<packet_type>(from);                                     \
+#define SYCL_PSTORE(scalar, packet_type, alignment)             \
+  template <>                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
+      scalar* to, const packet_type& from) {                    \
+    typedef cl::sycl::multi_ptr<                                \
+        scalar, cl::sycl::access::address_space::generic_space, \
+        cl::sycl::access::decorated::no>                        \
+        multi_ptr;                                              \
+    from.store(0, multi_ptr(to));                               \
   }
 
-// global space
-SYCL_PLOADT(global_space)
-// local space
-SYCL_PLOADT(local_space)
-#undef SYCL_PLOADT
-#endif
+SYCL_PSTORE(float, cl::sycl::cl_float4, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, u)
 
-template <typename packet_type, int Alignment>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
-ploadt(const Eigen::TensorSycl::internal::RangeAccess<
-       cl::sycl::access::mode::read_write,
-       typename unpacket_traits<packet_type>::type>& from) {
-  return ploadt<packet_type, Alignment>(from.get_pointer());
-}
-#ifdef SYCL_DEVICE_ONLY
-
-// private_space
-#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)                 \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type                    \
-  ploadt_ro<packet_type, Alignment>(                                   \
-      const typename unpacket_traits<packet_type>::type* from) {       \
-    typedef typename unpacket_traits<packet_type>::type scalar;        \
-    auto res = packet_type(static_cast<scalar>(0));                    \
-    res.template load<cl::sycl::access::address_space::private_space>( \
-        0, const_cast<scalar*>(from));                                 \
-    return res;                                                        \
-  }
-
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
-
-#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)                    \
-  template <>                                                              \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
-      const typename unpacket_traits<packet_type>::type* from) {           \
-    typedef typename unpacket_traits<packet_type>::type scalar;            \
-    auto res = packet_type(static_cast<scalar>(0));                        \
-    res.template load<cl::sycl::access::address_space::private_space>(     \
-        0, const_cast<scalar*>(from));                                     \
-    return res;                                                            \
-  }
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
-
-#undef SYCL_PLOAD_SPECIAL
-
-#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)   \
-  template <>                                                               \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(             \
-      typename cl::sycl::multi_ptr<                                         \
-          scalar,                                                           \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          to,                                                               \
-      const packet_type& from) {                                            \
-    typedef cl::sycl::multi_ptr<                                            \
-        scalar, cl::sycl::access::address_space::address_space_target>      \
-        multi_ptr;                                                          \
-    from.store(0, multi_ptr(to));                                           \
-  }
-
-// global space
-SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
-SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
-
-SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
 #undef SYCL_PSTORE
 
-#define SYCL_PSTORE_T(address_space_target)                                 \
-  template <typename scalar, typename packet_type, int Alignment>           \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(                       \
-      typename cl::sycl::multi_ptr<                                         \
-          scalar,                                                           \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          to,                                                               \
-      const packet_type& from) {                                            \
-    if (Alignment)                                                          \
-      pstore(to, from);                                                     \
-    else                                                                    \
-      pstoreu(to, from);                                                    \
-  }
-
-SYCL_PSTORE_T(global_space)
-
-SYCL_PSTORE_T(local_space)
-
-#undef SYCL_PSTORE_T
-
 #define SYCL_PSET1(packet_type)                                         \
   template <>                                                           \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -291,22 +151,6 @@
   }
 };
 
-#define SYCL_PLOAD_DUP(address_space_target)                                \
-  template <typename packet_type>                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup(               \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from) {                                                           \
-    return get_base_packet<packet_type>::get_ploaddup(from);                \
-  }
-
-// global space
-SYCL_PLOAD_DUP(global_space)
-// local_space
-SYCL_PLOAD_DUP(local_space)
-#undef SYCL_PLOAD_DUP
-
 #define SYCL_PLOAD_DUP_SPECILIZE(packet_type)                              \
   template <>                                                              \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
@@ -325,30 +169,11 @@
       const typename unpacket_traits<packet_type>::type& a) {           \
     return get_base_packet<packet_type>::set_plset(a);                  \
   }
-
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
 
 #undef SYCL_PLSET
 
-#define SYCL_PGATHER(address_space_target)                                  \
-  template <typename Scalar, typename packet_type>                          \
-  EIGEN_DEVICE_FUNC inline packet_type pgather(                             \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from,                                                             \
-      Index stride) {                                                       \
-    return get_base_packet<packet_type>::get_pgather(from, stride);         \
-  }
-
-// global space
-SYCL_PGATHER(global_space)
-// local space
-SYCL_PGATHER(local_space)
-
-#undef SYCL_PGATHER
-
 #define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                            \
   template <>                                                                  \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type                            \
@@ -362,24 +187,6 @@
 
 #undef SYCL_PGATHER_SPECILIZE
 
-#define SYCL_PSCATTER(address_space_target)                                 \
-  template <typename Scalar, typename packet_type>                          \
-  EIGEN_DEVICE_FUNC inline void pscatter(                                   \
-      typename cl::sycl::multi_ptr<                                         \
-          typename unpacket_traits<packet_type>::type,                      \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          to,                                                               \
-      const packet_type& from, Index stride) {                              \
-    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
-  }
-
-// global space
-SYCL_PSCATTER(global_space)
-// local space
-SYCL_PSCATTER(local_space)
-
-#undef SYCL_PSCATTER
-
 #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                        \
   template <>                                                               \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
@@ -563,6 +370,34 @@
 }
 #endif  // SYCL_DEVICE_ONLY
 
+template <typename packet_type, int Alignment, typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
+          cl::sycl::access::mode::read_write, T>& from) {
+  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
+}
+
+#define SYCL_PLOAD(Alignment, AlignedType)                              \
+  template <typename packet_type>                                       \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
+      const Eigen::TensorSycl::internal::RangeAccess<                   \
+          cl::sycl::access::mode::read_write,                           \
+          typename unpacket_traits<packet_type>::type>                  \
+          from) {                                                       \
+    return ploadt_ro<packet_type, Alignment>(from);                     \
+  }
+SYCL_PLOAD(Unaligned, u)
+SYCL_PLOAD(Aligned, )
+#undef SYCL_PLOAD
+
+template <typename packet_type, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt(const Eigen::TensorSycl::internal::RangeAccess<
+       cl::sycl::access::mode::read_write,
+       typename unpacket_traits<packet_type>::type>& from) {
+  return ploadt<packet_type, Alignment>(from.get_pointer());
+}
+
 #define SYCL_PSTORE(alignment)                                  \
   template <typename packet_type>                               \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
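For reference, the new SYCL_PLOAD macro above expands, for the unaligned cl_float4 case, to roughly the specialization below; this is a hand-written expansion for illustration, assuming unpacket_traits<cl::sycl::cl_float4>::type is float.

    template <>
    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
    ploadu<cl::sycl::cl_float4>(const float* from) {
      // A generic-space, undecorated multi_ptr works for global, local and
      // private allocations alike, which is why the per-address-space macro
      // variants could be removed.
      typedef cl::sycl::multi_ptr<const float,
                                  cl::sycl::access::address_space::generic_space,
                                  cl::sycl::access::decorated::no>
          multi_ptr;
      cl::sycl::cl_float4 res{};
      res.load(0, multi_ptr(from));
      return res;
    }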
diff --git a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
index 54eedfa..c532c18 100644
--- a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
+++ b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
@@ -223,7 +223,6 @@
         m_pointerMap.clear();
         EIGEN_THROW_X(
             std::out_of_range("The pointer is not registered in the map\n"));
-
       }
       --node;
     }
@@ -550,7 +549,7 @@
   static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
   typedef T scalar_t;
   typedef scalar_t &ref_t;
-  typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
+  typedef scalar_t *ptr_t;
 
   // the accessor type is not necessarily the same as T
   typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
@@ -570,7 +569,12 @@
   RangeAccess(std::nullptr_t) : RangeAccess() {}
   // This template parameter must be removed and scalar_t should be replaced
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
-    return (access_.get_pointer().get() + offset_);
+    typedef cl::sycl::multi_ptr<scalar_t,
+                                cl::sycl::access::address_space::generic_space,
+                                cl::sycl::access::decorated::no>
+        multi_ptr;
+    multi_ptr p(access_);
+    return (p + offset_).get_raw();
   }
   template <typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h
index 613e823..f6f057b 100644
--- a/Eigen/src/Core/arch/SYCL/TypeCasting.h
+++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -64,7 +64,7 @@
                                cl::sycl::rounding_mode::automatic>();
   auto b1 = b.template convert<cl::sycl::cl_float,
                                cl::sycl::rounding_mode::automatic>();
-  return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
+  return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
 }
 
 template <>
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index c9d80e6..ca08991 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -154,7 +154,7 @@
   enum {
     Cost = 3 * NumTraits<Scalar>::ReadCost,
     PacketAccess = 
-    #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
+    #if defined(EIGEN_VECTORIZE_AVX) && (EIGEN_CLANG_STRICT_LESS_THAN(8,0,0) || EIGEN_COMP_CLANGAPPLE)
     // This is a partial workaround for a bug in clang generating bad code
     // when mixing 256/512 bits loads and 128 bits moves.
     // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index c8bb4e7..3c76019 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -516,68 +516,25 @@
 template <typename LhsScalar, typename RhsScalar>
 struct scalar_atan2_op {
   using Scalar = LhsScalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_same<LhsScalar,RhsScalar>::value, Scalar>
-  operator()(const Scalar& y, const Scalar& x) const {
-    EIGEN_USING_STD(atan2);
-    return static_cast<Scalar>(atan2(y, x));
+
+  static constexpr bool Enable = is_same<LhsScalar, RhsScalar>::value && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex;
+  EIGEN_STATIC_ASSERT(Enable, "LhsScalar and RhsScalar must be the same non-integer, non-complex type")
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& y, const Scalar& x) const {
+    return numext::atan2(y, x);
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      std::enable_if_t<is_same<LhsScalar, RhsScalar>::value, Packet>
-      packetOp(const Packet& y, const Packet& x) const {
-    // See https://en.cppreference.com/w/cpp/numeric/math/atan2
-    // for how corner cases are supposed to be handled according to the
-    // IEEE floating-point standard (IEC 60559).
-    const Packet kSignMask = pset1<Packet>(-Scalar(0));
-    const Packet kPi = pset1<Packet>(Scalar(EIGEN_PI));
-    const Packet kPiO2 = pset1<Packet>(Scalar(EIGEN_PI / 2));
-    const Packet kPiO4 = pset1<Packet>(Scalar(EIGEN_PI / 4));
-    const Packet k3PiO4 = pset1<Packet>(Scalar(3.0 * (EIGEN_PI / 4)));
-
-    // Various predicates about the inputs.
-    Packet x_signbit = pand(x, kSignMask);
-    Packet x_has_signbit = pcmp_lt(por(x_signbit, kPi), pzero(x));
-    Packet x_is_zero = pcmp_eq(x, pzero(x));
-    Packet x_neg = pandnot(x_has_signbit, x_is_zero);
-
-    Packet y_signbit = pand(y, kSignMask);
-    Packet y_is_zero = pcmp_eq(y, pzero(y));
-    Packet x_is_not_nan = pcmp_eq(x, x);
-    Packet y_is_not_nan = pcmp_eq(y, y);
-
-    // Compute the normal case. Notice that we expect that
-    // finite/infinite = +/-0 here.
-    Packet result = patan(pdiv(y, x));
-
-    // Compute shift for when x != 0 and y != 0.
-    Packet shift = pselect(x_neg, por(kPi, y_signbit), pzero(x));
-
-    // Special cases:
-    //   Handle  x = +/-inf && y = +/-inf.
-    Packet is_not_nan = pcmp_eq(result, result);
-    result =
-        pselect(is_not_nan, padd(shift, result),
-                pselect(x_neg, por(k3PiO4, y_signbit), por(kPiO4, y_signbit)));
-    //   Handle x == +/-0.
-    result = pselect(
-        x_is_zero, pselect(y_is_zero, pzero(y), por(y_signbit, kPiO2)), result);
-    //   Handle y == +/-0.
-    result = pselect(
-        y_is_zero,
-        pselect(x_has_signbit, por(y_signbit, kPi), por(y_signbit, pzero(y))),
-        result);
-    // Handle NaN inputs.
-    Packet kQNaN = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
-    return pselect(pand(x_is_not_nan, y_is_not_nan), result, kQNaN);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& y, const Packet& x) const {
+    return internal::patan2(y, x);
   }
 };
 
 template<typename LhsScalar,typename RhsScalar>
     struct functor_traits<scalar_atan2_op<LhsScalar, RhsScalar>> {
+  using Scalar = LhsScalar;
   enum {
-    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasATan && packet_traits<LhsScalar>::HasDiv && !NumTraits<LhsScalar>::IsInteger && !NumTraits<LhsScalar>::IsComplex,
-    Cost =
-        scalar_div_cost<LhsScalar, PacketAccess>::value + 5 * NumTraits<LhsScalar>::MulCost + 5 * NumTraits<LhsScalar>::AddCost
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<Scalar>::HasATan && packet_traits<Scalar>::HasDiv && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex,
+    Cost = scalar_div_cost<Scalar, PacketAccess>::value + functor_traits<scalar_atan_op<Scalar>>::Cost
   };
 };
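A minimal usage sketch of the refactored functor; the function below is illustrative, and binaryExpr with an explicit functor is just one way to reach it.

    #include <Eigen/Core>

    // Element-wise atan2: the scalar path calls numext::atan2, the vectorized
    // path calls internal::patan2 via packetOp.
    Eigen::ArrayXf angles(const Eigen::ArrayXf& y, const Eigen::ArrayXf& x) {
      return y.binaryExpr(x, Eigen::internal::scalar_atan2_op<float, float>());
    }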
 
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 4a6cef5..862efc6 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1212,7 +1212,7 @@
     traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
     traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
     traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
-    #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+    #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
     __asm__  ("" : "+x,m" (*A0));
     #endif
     EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
@@ -1628,7 +1628,7 @@
               RhsPanel27 rhs_panel;
               RhsPacket T0;
               LhsPacket A2;
-            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+            #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
             // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
             // without this workaround A0, A1, and A2 are loaded in the same register,
             // which is not good for pipelining
@@ -1834,7 +1834,7 @@
             RhsPanel15 rhs_panel;
             RhsPacket T0;
             LhsPacket A2;
-            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+            #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
             // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
             // without this workaround A0, A1, and A2 are loaded in the same register,
             // which is not good for pipelining
@@ -2081,7 +2081,7 @@
               RhsPacket T0;
               // NOTE: the begin/end asm comments below work around bug 935!
               // but they are not enough for gcc>=6 without FMA (bug 1637)
-              #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+              #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE)
                 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
               #else
                 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
@@ -2245,7 +2245,7 @@
 
           // NOTE: the begin/end asm comments below work around bug 935!
           // but they are not enough for gcc>=6 without FMA (bug 1637)
-          #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+          #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
             #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
           #else
             #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
diff --git a/Eigen/src/Core/util/Assert.h b/Eigen/src/Core/util/Assert.h
new file mode 100644
index 0000000..f8ba632
--- /dev/null
+++ b/Eigen/src/Core/util/Assert.h
@@ -0,0 +1,166 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022, The Eigen authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_UTIL_ASSERT_H
+#define EIGEN_CORE_UTIL_ASSERT_H
+
+// Eigen custom assert function.
+//
+// The combination of Eigen's relative includes and cassert's `assert` function
+// (or any usage of the __FILE__ macro) can lead to ODR issues:
+// a header included using different relative paths in two different TUs will
+// have two different token-for-token definitions, since __FILE__ is expanded
+// as an in-line string with different values.  Normally this would be
+// harmless - the linker would just choose one definition. However, it breaks
+// with C++20 modules when functions in different modules have different
+// definitions.
+//
+// To get around this, we need to use __builtin_FILE() when available, which is
+// considered a single token, and thus satisfies the ODR.
+
+// Only define eigen_plain_assert if we are debugging, and either
+//  - we are not compiling for GPU, or
+//  - gpu debugging is enabled.
+#if !defined(EIGEN_NO_DEBUG) && (!defined(EIGEN_GPU_COMPILE_PHASE) || !defined(EIGEN_NO_DEBUG_GPU))
+
+#include <cassert>
+
+#ifndef EIGEN_USE_CUSTOM_PLAIN_ASSERT
+// Disable new custom asserts by default for now.
+#define EIGEN_USE_CUSTOM_PLAIN_ASSERT  0
+#endif
+
+#if EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+#ifndef EIGEN_HAS_BUILTIN_FILE
+// Clang can check if __builtin_FILE() is supported.
+// GCC > 5 and MSVC 2019 14.26 (1926) both have __builtin_FILE().
+// 
+// For NVCC, it's more complicated.  Through trial-and-error:
+//   - nvcc+gcc supports __builtin_FILE() on host, and on device after CUDA 11.
+//   - nvcc+msvc supports __builtin_FILE() only after CUDA 11.
+#if (EIGEN_HAS_BUILTIN(__builtin_FILE) && (EIGEN_COMP_CLANG || !defined(EIGEN_CUDA_ARCH))) || \
+    (EIGEN_GNUC_STRICT_AT_LEAST(5, 0, 0) && (EIGEN_COMP_NVCC >= 110000 || !defined(EIGEN_CUDA_ARCH))) || \
+    (EIGEN_COMP_MSVC >= 1926 && (!EIGEN_COMP_NVCC || EIGEN_COMP_NVCC >= 110000))
+#define EIGEN_HAS_BUILTIN_FILE 1
+#else
+#define EIGEN_HAS_BUILTIN_FILE 0
+#endif
+#endif  // EIGEN_HAS_BUILTIN_FILE
+
+#if EIGEN_HAS_BUILTIN_FILE
+#  define EIGEN_BUILTIN_FILE __builtin_FILE()
+#  define EIGEN_BUILTIN_LINE __builtin_LINE()
+#else
+// Default (potentially unsafe) values.
+#  define EIGEN_BUILTIN_FILE __FILE__
+#  define EIGEN_BUILTIN_LINE __LINE__
+#endif
+
+// Use __PRETTY_FUNCTION__ when available, since it is more descriptive, as
+// __builtin_FUNCTION() only returns the undecorated function name.
+// This should still be okay ODR-wise since it is a compiler-specific fixed
+// value.  Mixing compilers will likely lead to ODR violations anyways.
+#if EIGEN_COMP_MSVC
+#  define EIGEN_BUILTIN_FUNCTION __FUNCSIG__
+#elif EIGEN_COMP_GNUC
+#  define EIGEN_BUILTIN_FUNCTION __PRETTY_FUNCTION__
+#else
+#  define EIGEN_BUILTIN_FUNCTION __func__
+#endif
+
+namespace Eigen {
+namespace internal {
+
+// Generic default assert handler.
+template<typename EnableIf = void, typename... EmptyArgs>
+struct assert_handler_impl {
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
+  static inline void run(const char* expression, const char* file, unsigned line, const char* function) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    // GPU device code doesn't allow stderr or abort, so use printf and raise an
+    // illegal instruction exception to trigger a kernel failure.
+#ifndef EIGEN_NO_IO
+    printf("Assertion failed at %s:%u in %s: %s\n",
+      file == nullptr ? "<file>" : file,
+      line,
+      function == nullptr ? "<function>" : function,
+      expression);
+#endif
+    __trap();
+      
+#else  // EIGEN_GPU_COMPILE_PHASE
+
+    // Print to stderr and abort, as specified in <cassert>.
+#ifndef EIGEN_NO_IO
+    fprintf(stderr, "Assertion failed at %s:%u in %s: %s\n",
+      file == nullptr ? "<file>" : file,
+      line,
+      function == nullptr ? "<function>" : function,
+      expression);
+#endif
+    std::abort();
+    
+#endif  // EIGEN_GPU_COMPILE_PHASE
+  }
+};
+
+// Use POSIX __assert_fail handler when available.
+//
+// This allows us to integrate with systems that have custom handlers.
+//
+// NOTE: this handler is not always available on all POSIX systems (otherwise
+// we could simply test for __unix__ or similar).  The handler function name
+// seems to depend on the specific toolchain implementation, and differs between
+// compilers, platforms, OSes, etc.  Hence, we detect support via SFINAE.
+template<typename... EmptyArgs>
+struct assert_handler_impl<
+    void_t<decltype(__assert_fail(
+        (const char*)nullptr,         // expression
+        (const char*)nullptr,         // file
+        0,                            // line
+        (const char*)nullptr,         // function
+        std::declval<EmptyArgs>()...  // Empty substitution required for SFINAE.
+    ))>, EmptyArgs... > {
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
+  static inline void run(const char* expression, const char* file, unsigned line, const char* function) {
+    // GCC requires this call to be dependent on the template parameters.
+    __assert_fail(expression, file, line, function, std::declval<EmptyArgs>()...);
+  }
+};
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
+inline void __assert_handler(const char* expression, const char* file, unsigned line, const char* function) {
+  assert_handler_impl<>::run(expression, file, line, function);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#define eigen_plain_assert(expression)                     \
+  (EIGEN_PREDICT_FALSE(!(expression)) ?                    \
+    Eigen::internal::__assert_handler(#expression,         \
+                                      EIGEN_BUILTIN_FILE,  \
+                                      EIGEN_BUILTIN_LINE,  \
+                                      EIGEN_BUILTIN_FUNCTION) : (void)0)
+
+#else  // EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+// Use regular assert.
+#define eigen_plain_assert(condition) assert(condition)
+
+#endif  // EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+#else   // EIGEN_NO_DEBUG
+
+#define eigen_plain_assert(condition) ((void)0)
+
+#endif  // EIGEN_NO_DEBUG
+
+#endif  // EIGEN_CORE_UTIL_ASSERT_H
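A small usage sketch of the new assert plumbing; the expansion shown in the comment assumes EIGEN_USE_CUSTOM_PLAIN_ASSERT is enabled and that __builtin_FILE() and __PRETTY_FUNCTION__ are available.

    #include <Eigen/Core>

    void check_rows(Eigen::Index rows) {
      // With the custom assert enabled this expands to roughly:
      //   (EIGEN_PREDICT_FALSE(!(rows >= 0))
      //        ? Eigen::internal::__assert_handler("rows >= 0", __builtin_FILE(),
      //                                            __builtin_LINE(), __PRETTY_FUNCTION__)
      //        : (void)0);
      // __builtin_FILE() is a single token, so the definition stays identical
      // across translation units regardless of the relative include path.
      eigen_plain_assert(rows >= 0);
    }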
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a2486af..979d974 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -478,8 +478,8 @@
     ExtractType,
     typename ExtractType_::PlainObject
     > DirectLinearAccessType;
-  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
-  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return x; }
+  EIGEN_DEVICE_FUNC static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
 };
 
 // pop conjugate
@@ -495,8 +495,8 @@
     IsComplex = NumTraits<Scalar>::IsComplex,
     NeedToConjugate = Base::NeedToConjugate ? 0 : IsComplex
   };
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
 };
 
 // pop scalar multiple
@@ -510,8 +510,8 @@
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
-  static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x)
   { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
 template<typename Scalar, typename NestedXpr, typename Plain>
@@ -524,8 +524,8 @@
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x)
   { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
 };
 template<typename Scalar, typename Plain1, typename Plain2>
@@ -545,8 +545,8 @@
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x)
   { return - Base::extractScalarFactor(x.nestedExpression()); }
 };
 
@@ -567,8 +567,8 @@
   enum {
     IsTransposed = Base::IsTransposed ? 0 : 1
   };
-  static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
-  static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
 };
 
 template<typename T>
@@ -586,7 +586,7 @@
 
 template<typename T>
 struct extract_data_selector<T,false> {
-  static typename T::Scalar* run(const T&) { return 0; }
+  EIGEN_DEVICE_FUNC static typename T::Scalar* run(const T&) { return 0; }
 };
 
 template<typename T>
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 7c1a08b..643d6400 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -186,14 +186,14 @@
     #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
   #endif
 #else
-  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_COMP_GNUC )
-    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #if defined(__SSE2__)
+    #define EIGEN_SSE2_ON_NON_MSVC
   #endif
 #endif
 
 #if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
 
-  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+  #if defined (EIGEN_SSE2_ON_NON_MSVC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
 
     // Defines symbols for compile-time detection of which instructions are
     // used.
@@ -285,7 +285,7 @@
     #endif
 
     // Disable AVX support on broken xcode versions
-    #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
+    #if ( EIGEN_COMP_CLANGAPPLE == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
       // A nasty bug in the clang compiler shipped with xcode in a common compilation situation
       // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
       #ifdef EIGEN_VECTORIZE_AVX
@@ -352,10 +352,10 @@
       #endif
     } // end extern "C"
 
-  #elif defined __VSX__
+  #elif defined(__VSX__) && !defined(__APPLE__)
 
     #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_VSX
+    #define EIGEN_VECTORIZE_VSX 1
     #include <altivec.h>
     // We need to #undef all these ugly tokens defined in <altivec.h>
     // => use __vector instead of vector
@@ -427,7 +427,7 @@
   #include <arm_fp16.h>
 #endif
 
-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
+#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3,8,0))
   // We can use the optimized fp16 to float and float to fp16 conversion routines
   #define EIGEN_HAS_FP16_C
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 8f87c4a..facb074 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -192,6 +192,8 @@
 template<typename Scalar> struct scalar_acos_op;
 template<typename Scalar> struct scalar_asin_op;
 template<typename Scalar> struct scalar_tan_op;
+template<typename Scalar> struct scalar_atan_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar> struct scalar_atan2_op;
 template<typename Scalar> struct scalar_inverse_op;
 template<typename Scalar> struct scalar_square_op;
 template<typename Scalar> struct scalar_cube_op;
@@ -252,15 +254,22 @@
 template<typename MatrixType,int RowFactor,int ColFactor> class Replicate;
 template<typename MatrixType, int Direction = BothDirections> class Reverse;
 
+#if defined(EIGEN_USE_LAPACKE) && defined(lapack_int)
+// Lapacke interface requires StorageIndex to be lapack_int
+typedef lapack_int DefaultPermutationIndex;
+#else
+typedef int DefaultPermutationIndex;
+#endif
+
 template<typename MatrixType> class FullPivLU;
 template<typename MatrixType> class PartialPivLU;
 namespace internal {
 template<typename MatrixType> struct inverse_impl;
 }
 template<typename MatrixType> class HouseholderQR;
-template<typename MatrixType> class ColPivHouseholderQR;
-template<typename MatrixType> class FullPivHouseholderQR;
-template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType, typename PermutationIndex = DefaultPermutationIndex> class ColPivHouseholderQR;
+template<typename MatrixType, typename PermutationIndex = DefaultPermutationIndex> class FullPivHouseholderQR;
+template<typename MatrixType, typename PermutationIndex = DefaultPermutationIndex> class CompleteOrthogonalDecomposition;
 template<typename MatrixType> class SVDBase;
 template<typename MatrixType, int Options = 0> class JacobiSVD;
 template<typename MatrixType, int Options = 0> class BDCSVD;
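A short usage sketch of the new PermutationIndex template parameter; the second example assumes any signed integer type is accepted for the permutation storage index.

    #include <Eigen/QR>

    // Defaults to Eigen::DefaultPermutationIndex (int, or lapack_int when
    // built against LAPACKE with lapack_int defined).
    Eigen::ColPivHouseholderQR<Eigen::MatrixXd> qr_default;

    // Explicit permutation index type.
    Eigen::ColPivHouseholderQR<Eigen::MatrixXd, long> qr_long;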
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 4cd81a0..dd94f88 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -63,20 +63,27 @@
 // Compiler identification, EIGEN_COMP_*
 //------------------------------------------------------------------------------------------
 
-/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
+/// \internal EIGEN_COMP_GNUC set to version (e.g., 951 for GCC 9.5.1) for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
+  #define EIGEN_COMP_GNUC (__GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__)
 #else
   #define EIGEN_COMP_GNUC 0
 #endif
 
-/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
+/// \internal EIGEN_COMP_CLANG set to version (e.g., 372 for clang 3.7.2) if the compiler is clang
 #if defined(__clang__)
-  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
+  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__*10+__clang_patchlevel__)
 #else
   #define EIGEN_COMP_CLANG 0
 #endif
 
+/// \internal EIGEN_COMP_CLANGAPPLE set to the version number (e.g. 9000000 for AppleClang 9.0) if the compiler is AppleClang
+#if defined(__clang__) && defined(__apple_build_version__)
+  #define EIGEN_COMP_CLANGAPPLE __apple_build_version__
+#else
+  #define EIGEN_COMP_CLANGAPPLE 0
+#endif
+
 /// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML
 #if defined(__castxml__)
   #define EIGEN_COMP_CASTXML 1
@@ -248,25 +255,48 @@
 #endif
 
 
-/// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
+/// \internal EIGEN_COMP_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
 #if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_CLANGICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CLANGFCC || EIGEN_COMP_CPE || EIGEN_COMP_CLANGCPE || EIGEN_COMP_LCC)
   #define EIGEN_COMP_GNUC_STRICT 1
 #else
   #define EIGEN_COMP_GNUC_STRICT 0
 #endif
 
-
-#if EIGEN_COMP_GNUC
-  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
-  #define EIGEN_GNUC_AT_MOST(x,y)  ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
-  #define EIGEN_GNUC_AT(x,y)       ( __GNUC__==x && __GNUC_MINOR__==y )
+// GCC, and compilers that pretend to be it, have different version schemes, so this only makes sense to use with the real GCC.
+#if EIGEN_COMP_GNUC_STRICT
+  #define EIGEN_GNUC_STRICT_AT_LEAST(x,y,z)  ((__GNUC__ > x) || \
+                                              (__GNUC__ == x && __GNUC_MINOR__ > y) || \
+                                              (__GNUC__ == x && __GNUC_MINOR__ == y && __GNUC_PATCHLEVEL__ >= z))
+  #define EIGEN_GNUC_STRICT_LESS_THAN(x,y,z) ((__GNUC__ < x) || \
+                                              (__GNUC__ == x && __GNUC_MINOR__ < y) || \
+                                              (__GNUC__ == x && __GNUC_MINOR__ == y && __GNUC_PATCHLEVEL__ < z))
 #else
-  #define EIGEN_GNUC_AT_LEAST(x,y) 0
-  #define EIGEN_GNUC_AT_MOST(x,y)  0
-  #define EIGEN_GNUC_AT(x,y)       0
+  #define EIGEN_GNUC_STRICT_AT_LEAST(x,y,z)  0
+  #define EIGEN_GNUC_STRICT_LESS_THAN(x,y,z) 0
 #endif
 
 
+
+/// \internal EIGEN_COMP_CLANG_STRICT set to 1 if the compiler is really Clang and not a compatible compiler (e.g., AppleClang, etc.)
+#if EIGEN_COMP_CLANG && !(EIGEN_COMP_CLANGAPPLE || EIGEN_COMP_CLANGICC || EIGEN_COMP_CLANGFCC || EIGEN_COMP_CLANGCPE)
+  #define EIGEN_COMP_CLANG_STRICT 1
+#else
+  #define EIGEN_COMP_CLANG_STRICT 0
+#endif
+
+// Clang, and compilers forked from it, have different version schemes, so this only makes sense to use with the real Clang.
+#if EIGEN_COMP_CLANG_STRICT
+  #define EIGEN_CLANG_STRICT_AT_LEAST(x,y,z)  ((__clang_major__ > x) || \
+                                               (__clang_major__ == x && __clang_minor__ > y) || \
+                                               (__clang_major__ == x && __clang_minor__ == y && __clang_patchlevel__ >= z))
+  #define EIGEN_CLANG_STRICT_LESS_THAN(x,y,z) ((__clang_major__ < x) || \
+                                               (__clang_major__ == x && __clang_minor__ < y) || \
+                                               (__clang_major__ == x && __clang_minor__ == y && __clang_patchlevel__ < z))
+#else
+  #define EIGEN_CLANG_STRICT_AT_LEAST(x,y,z)  0
+  #define EIGEN_CLANG_STRICT_LESS_THAN(x,y,z) 0
+#endif
+
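A short sketch of how the new three-argument version checks replace the removed two-argument ones; the guarded bodies are placeholders.

    #include <Eigen/Core>

    // Old: EIGEN_GNUC_AT_LEAST(6,0) was true for any GCC-compatible compiler.
    // New: full major.minor.patch comparison, true only for real GCC.
    #if EIGEN_GNUC_STRICT_AT_LEAST(6, 3, 0)
      // Rely on a fix first shipped in GCC 6.3.0.
    #endif

    #if EIGEN_CLANG_STRICT_LESS_THAN(8, 0, 0)
      // Workaround for upstream Clang before 8.0.0; AppleClang is excluded
      // because EIGEN_COMP_CLANG_STRICT is 0 for it.
    #endif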
 //------------------------------------------------------------------------------------------
 // Architecture identification, EIGEN_ARCH_*
 //------------------------------------------------------------------------------------------
@@ -375,7 +405,7 @@
 #endif
 
 /// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
-#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__)
   #define EIGEN_ARCH_PPC 1
 #else
   #define EIGEN_ARCH_PPC 0
@@ -671,10 +701,13 @@
 // but in practice we should not rely on them but rather on the availability of
 // individual features as defined later.
 // This is why there is no EIGEN_HAS_CXX17.
-#if EIGEN_MAX_CPP_VER<14 || EIGEN_COMP_CXXVER<14 || (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1900) || \
-  (EIGEN_COMP_ICC && EIGEN_COMP_ICC < 1500) || (EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 80000) ||     \
-  (EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))) || \
-  (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<51)
+#if EIGEN_MAX_CPP_VER < 14 || EIGEN_COMP_CXXVER < 14 || \
+  (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1900) || \
+  (EIGEN_COMP_ICC && EIGEN_COMP_ICC < 1500) || \
+  (EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 80000) || \
+  (EIGEN_COMP_CLANG_STRICT && EIGEN_COMP_CLANG < 390) || \
+  (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE < 9000000) || \
+  (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 510)
 #error This compiler appears to be too old to be supported by Eigen
 #endif
 
@@ -721,11 +754,11 @@
 //       for over-aligned data, but not in a manner that is compatible with Eigen.
 //       See https://gitlab.com/libeigen/eigen/-/issues/2575
 #ifndef EIGEN_HAS_CXX17_OVERALIGN
-#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                                 \
-           (EIGEN_COMP_MSVC >= 1912)                                                    \
-        || (EIGEN_GNUC_AT_LEAST(7,0))                                                   \
-        || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500))             \
-        || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
+#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                 \
+           (EIGEN_COMP_MSVC >= 1912)                                    \
+        || (EIGEN_GNUC_STRICT_AT_LEAST(7,0,0))                          \
+        || (EIGEN_CLANG_STRICT_AT_LEAST(5,0,0))                         \
+        || (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE >= 10000000) \
       ) && !EIGEN_COMP_ICC
 #define EIGEN_HAS_CXX17_OVERALIGN 1
 #else
@@ -819,8 +852,8 @@
 
 // GPU stuff
 
-// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
-#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC)
+// Disable some features when compiling with GPU compilers (SYCL/HIPCC)
+#if defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIP_DEVICE_COMPILE)
   // Do not try asserts on device code
   #ifndef EIGEN_NO_DEBUG
   #define EIGEN_NO_DEBUG
@@ -829,7 +862,10 @@
   #ifdef EIGEN_INTERNAL_DEBUGGING
   #undef EIGEN_INTERNAL_DEBUGGING
   #endif
+#endif
 
+// No exceptions on device.
+#if defined(SYCL_DEVICE_ONLY) || defined(EIGEN_GPU_COMPILE_PHASE)
   #ifdef EIGEN_EXCEPTIONS
   #undef EIGEN_EXCEPTIONS
   #endif
@@ -861,17 +897,6 @@
 # endif
 #endif
 
-// eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
-#ifdef EIGEN_NO_DEBUG
-  #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device
-    #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x)
-  #else
-    #define eigen_plain_assert(x)
-  #endif
-#else
-    #define eigen_plain_assert(x) assert(x)
-#endif
-
 // eigen_assert can be overridden
 #ifndef eigen_assert
 #define eigen_assert(x) eigen_plain_assert(x)
@@ -880,10 +905,10 @@
 #ifdef EIGEN_INTERNAL_DEBUGGING
 #define eigen_internal_assert(x) eigen_assert(x)
 #else
-#define eigen_internal_assert(x)
+#define eigen_internal_assert(x) ((void)0)
 #endif
 
-#ifdef EIGEN_NO_DEBUG
+#if defined(EIGEN_NO_DEBUG) || (defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_NO_DEBUG_GPU))
 #define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
 #else
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
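
Note: eigen_assert remains user-overridable by defining the macro before the first Eigen include; a minimal sketch (illustrative only, not part of this patch) that routes failed assertions to an exception instead of the default eigen_plain_assert:

    // Define before any Eigen header is included; otherwise Eigen falls back to eigen_plain_assert.
    #include <stdexcept>
    #define eigen_assert(x) \
      do { if (!(x)) throw std::runtime_error("Eigen assertion failed: " #x); } while (false)
    #include <Eigen/Core>
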
@@ -980,11 +1005,16 @@
     // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
     // you will need to apply to the underlying POD type.
     #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
-      // This seems to be broken on clang.  Packet4f is loaded into a single
-      //   register rather than a vector, zeroing out some entries.  Integer
+      // This seems to be broken on clang. Packet4f is loaded into a single
+      //   register rather than a vector, zeroing out some entries. Integer
       //   types also generate a compile error.
-      // General, Altivec, VSX.
-      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
+      #if EIGEN_OS_MAC
+        // General, Altivec for Apple (VSX was added in ISA v2.06):
+        #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v" (X));
+      #else
+        // General, Altivec, VSX otherwise:
+        #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
+      #endif
     #elif EIGEN_ARCH_ARM_OR_ARM64
       #ifdef __ARM_FP
         // General, VFP or NEON.
@@ -1237,10 +1267,10 @@
 namespace Eigen {
 namespace internal {
 
-inline bool all(){ return true; }
+EIGEN_DEVICE_FUNC inline bool all(){ return true; }
 
 template<typename T, typename ...Ts>
-bool all(T t, Ts ... ts){ return t && all(ts...); }
+EIGEN_DEVICE_FUNC bool all(T t, Ts ... ts){ return t && all(ts...); }
 
 }
 }
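
Note: the internal variadic all() helper above is a plain logical fold, now callable from device code; a tiny usage sketch (illustrative, internal API, not part of this patch):

    #include <Eigen/Core>
    #include <cassert>

    int main() {
      assert(Eigen::internal::all());                    // empty pack is vacuously true
      assert(Eigen::internal::all(true, 1 == 1, 2 > 1)); // all conditions hold
      assert(!Eigen::internal::all(true, false));        // one failure makes the fold false
      return 0;
    }
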
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index e4a8793..a39c718 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -59,6 +59,25 @@
 
 #endif
 
+#ifndef EIGEN_MALLOC_CHECK_THREAD_LOCAL
+
+// Check whether we can use the thread_local keyword to allow or disallow
+// allocating memory with per-thread granularity, by means of the
+// set_is_malloc_allowed() function.
+#ifndef EIGEN_AVOID_THREAD_LOCAL
+
+#if ((EIGEN_COMP_GNUC) || __has_feature(cxx_thread_local) || EIGEN_COMP_MSVC >= 1900) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL thread_local
+#else
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL
+#endif
+
+#else // EIGEN_AVOID_THREAD_LOCAL
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL
+#endif // EIGEN_AVOID_THREAD_LOCAL
+
+#endif
+
 #include "../InternalHeaderCheck.h"
 
 namespace Eigen {
@@ -156,7 +175,7 @@
 #elif defined EIGEN_RUNTIME_NO_MALLOC
 EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
 {
-  static bool value = true;
+  EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
   if (update == 1)
     value = new_value;
   return value;
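
Note: with EIGEN_RUNTIME_NO_MALLOC, the flag behind set_is_malloc_allowed() is now thread_local where supported, so the switch is per thread; a rough sketch (illustrative, not part of this patch):

    #define EIGEN_RUNTIME_NO_MALLOC   // must precede the Eigen includes
    #include <Eigen/Dense>
    #include <thread>

    void worker() {
      Eigen::internal::set_is_malloc_allowed(false);  // only disables heap use on this thread
      Eigen::Matrix4f a = Eigen::Matrix4f::Random();  // fixed-size objects need no allocation
      Eigen::Matrix4f b = a * a;
      (void)b;
      Eigen::internal::set_is_malloc_allowed(true);
    }

    int main() {
      std::thread t(worker);
      Eigen::MatrixXf big(1000, 1000);  // the main thread may still allocate
      (void)big;
      t.join();
      return 0;
    }
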
@@ -938,7 +957,7 @@
 
   ~aligned_allocator() {}
 
-  #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
+  #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_STRICT_AT_LEAST(7,0,0)
   // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
   // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
   // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 6c6fb71..1a5d515 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -153,6 +153,21 @@
 template< class T >
 struct is_void : is_same<void, std::remove_const_t<T>> {};
 
+/** \internal
+  * Implementation of std::void_t for SFINAE.
+  *
+  * Pre C++17:
+  * Custom implementation.
+  *
+  * Post C++17: Uses std::void_t
+  */
+#if EIGEN_COMP_CXXVER >= 17
+using std::void_t;
+#else
+template<typename...>
+using void_t = void;
+#endif
+
 template<> struct is_arithmetic<signed long long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
 using std::is_integral;
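
Note: the new internal void_t alias enables the usual detection idiom; a short sketch with a hypothetical trait name (illustrative, not part of this patch):

    #include <Eigen/Core>
    #include <type_traits>

    // Detect whether a type exposes a nested Scalar typedef.
    template <typename T, typename = void>
    struct has_nested_scalar : std::false_type {};

    template <typename T>
    struct has_nested_scalar<T, Eigen::internal::void_t<typename T::Scalar>> : std::true_type {};

    static_assert(has_nested_scalar<Eigen::Matrix3d>::value, "dense matrices expose Scalar");
    static_assert(!has_nested_scalar<double>::value, "a plain double does not");
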
@@ -232,7 +247,7 @@
   *
   * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
   */
-#if EIGEN_COMP_CXXVER < 20  || EIGEN_GNUC_AT_MOST(9,4)
+#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_STRICT_LESS_THAN(10,0,0)
 template <typename T>
 EIGEN_CONSTEXPR auto index_list_size(const T& x) {
   using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index c906997..6855893 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -16,12 +16,12 @@
 namespace Eigen {
 
 namespace internal {
-template<typename MatrixType_> struct traits<ColPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> struct traits<ColPivHouseholderQR<MatrixType_, PermutationIndex_> >
  : traits<MatrixType_>
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
+  typedef PermutationIndex_ PermutationIndex;
   enum { Flags = 0 };
 };
 
@@ -50,31 +50,41 @@
   * 
   * \sa MatrixBase::colPivHouseholderQr()
   */
-template<typename MatrixType_> class ColPivHouseholderQR
-        : public SolverBase<ColPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> class ColPivHouseholderQR
+        : public SolverBase<ColPivHouseholderQR<MatrixType_, PermutationIndex_> >
 {
   public:
 
     typedef MatrixType_ MatrixType;
     typedef SolverBase<ColPivHouseholderQR> Base;
     friend class SolverBase<ColPivHouseholderQR>;
-
+    typedef PermutationIndex_ PermutationIndex;
     EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
+
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
-    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
+    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
+    typedef typename internal::plain_row_type<MatrixType, PermutationIndex>::type IntRowVectorType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
     typedef HouseholderSequence<MatrixType,internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>> HouseholderSequenceType;
     typedef typename MatrixType::PlainObject PlainObject;
 
-  private:
-
-    typedef typename PermutationType::StorageIndex PermIndexType;
+private:
+    void init(Index rows, Index cols) {
+      Index diag = numext::mini(rows, cols);
+      m_hCoeffs.resize(diag);
+      m_colsPermutation.resize(cols);
+      m_colsTranspositions.resize(cols);
+      m_temp.resize(cols);
+      m_colNormsUpdated.resize(cols);
+      m_colNormsDirect.resize(cols);
+      m_isInitialized = false;
+      m_usePrescribedThreshold = false;
+    }
 
   public:
 
@@ -101,16 +111,7 @@
       * according to the specified problem \a size.
       * \sa ColPivHouseholderQR()
       */
-    ColPivHouseholderQR(Index rows, Index cols)
-      : m_qr(rows, cols),
-        m_hCoeffs((std::min)(rows,cols)),
-        m_colsPermutation(PermIndexType(cols)),
-        m_colsTranspositions(cols),
-        m_temp(cols),
-        m_colNormsUpdated(cols),
-        m_colNormsDirect(cols),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {}
+    ColPivHouseholderQR(Index rows, Index cols) : m_qr(rows, cols) { init(rows, cols); }
 
     /** \brief Constructs a QR factorization from a given matrix
       *
@@ -124,18 +125,9 @@
       *
       * \sa compute()
       */
-    template<typename InputType>
-    explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix)
-      : m_qr(matrix.rows(), matrix.cols()),
-        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
-        m_colsPermutation(PermIndexType(matrix.cols())),
-        m_colsTranspositions(matrix.cols()),
-        m_temp(matrix.cols()),
-        m_colNormsUpdated(matrix.cols()),
-        m_colNormsDirect(matrix.cols()),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false)
-    {
+    template <typename InputType>
+    explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix) : m_qr(matrix.rows(), matrix.cols()) {
+      init(matrix.rows(), matrix.cols());
       compute(matrix.derived());
     }
 
@@ -145,18 +137,9 @@
       *
       * \sa ColPivHouseholderQR(const EigenBase&)
       */
-    template<typename InputType>
-    explicit ColPivHouseholderQR(EigenBase<InputType>& matrix)
-      : m_qr(matrix.derived()),
-        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
-        m_colsPermutation(PermIndexType(matrix.cols())),
-        m_colsTranspositions(matrix.cols()),
-        m_temp(matrix.cols()),
-        m_colNormsUpdated(matrix.cols()),
-        m_colNormsDirect(matrix.cols()),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false)
-    {
+    template <typename InputType>
+    explicit ColPivHouseholderQR(EigenBase<InputType>& matrix) : m_qr(matrix.derived()) {
+      init(matrix.rows(), matrix.cols());
       computeInPlace();
     }
 
@@ -441,7 +424,7 @@
 
   protected:
 
-    friend class CompleteOrthogonalDecomposition<MatrixType>;
+    friend class CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>;
 
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
 
@@ -460,8 +443,8 @@
     Index m_det_p;
 };
 
-template<typename MatrixType>
-typename MatrixType::Scalar ColPivHouseholderQR<MatrixType>::determinant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar ColPivHouseholderQR<MatrixType, PermutationIndex>::determinant() const
 {
   eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -470,8 +453,8 @@
   return m_qr.diagonal().prod() * detQ * Scalar(m_det_p);
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::absDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType, PermutationIndex>::absDeterminant() const
 {
   using std::abs;
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
@@ -479,8 +462,8 @@
   return abs(m_qr.diagonal().prod());
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType, PermutationIndex>::logAbsDeterminant() const
 {
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -493,20 +476,20 @@
   *
   * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
   */
-template<typename MatrixType>
+template<typename MatrixType, typename PermutationIndex>
 template<typename InputType>
-ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+ColPivHouseholderQR<MatrixType, PermutationIndex>& ColPivHouseholderQR<MatrixType, PermutationIndex>::compute(const EigenBase<InputType>& matrix)
 {
   m_qr = matrix.derived();
   computeInPlace();
   return *this;
 }
 
-template<typename MatrixType>
-void ColPivHouseholderQR<MatrixType>::computeInPlace()
+template<typename MatrixType, typename PermutationIndex>
+void ColPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace()
 {
-  // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(m_qr.cols()<=NumTraits<int>::highest());
+
+  eigen_assert(m_qr.cols()<=NumTraits<PermutationIndex>::highest());
 
   using std::abs;
 
@@ -549,7 +532,7 @@
       m_nonzero_pivots = k;
 
     // apply the transposition to the columns
-    m_colsTranspositions.coeffRef(k) = biggest_col_index;
+    m_colsTranspositions.coeffRef(k) = static_cast<PermutationIndex>(biggest_col_index);
     if(k != biggest_col_index) {
       m_qr.col(k).swap(m_qr.col(biggest_col_index));
       std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index));
@@ -595,18 +578,18 @@
     }
   }
 
-  m_colsPermutation.setIdentity(PermIndexType(cols));
-  for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
-    m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
+  m_colsPermutation.setIdentity(cols);
+  for(Index k = 0; k < size/*m_nonzero_pivots*/; ++k)
+    m_colsPermutation.applyTranspositionOnTheRight(k, static_cast<Index>(m_colsTranspositions.coeff(k)));
 
   m_det_p = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
 }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
 template<typename RhsType, typename DstType>
-void ColPivHouseholderQR<MatrixType_>::_solve_impl(const RhsType &rhs, DstType &dst) const
+void ColPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
   const Index nonzero_pivots = nonzeroPivots();
 
@@ -628,9 +611,9 @@
   for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
 }
 
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
 template<bool Conjugate, typename RhsType, typename DstType>
-void ColPivHouseholderQR<MatrixType_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+void ColPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
 {
   const Index nonzero_pivots = nonzeroPivots();
 
@@ -656,10 +639,10 @@
 
 namespace internal {
 
-template<typename DstXprType, typename MatrixType>
-struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename ColPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+template<typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType, PermutationIndex> >, internal::assign_op<typename DstXprType::Scalar,typename ColPivHouseholderQR<MatrixType, PermutationIndex>::Scalar>, Dense2Dense>
 {
-  typedef ColPivHouseholderQR<MatrixType> QrType;
+  typedef ColPivHouseholderQR<MatrixType, PermutationIndex> QrType;
   typedef Inverse<QrType> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
   {
@@ -672,8 +655,8 @@
 /** \returns the matrix Q as a sequence of householder transformations.
   * You can extract the meaningful part only by using:
   * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode*/
-template<typename MatrixType>
-typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
+template<typename MatrixType, typename PermutationIndex>
+typename ColPivHouseholderQR<MatrixType, PermutationIndex>::HouseholderSequenceType ColPivHouseholderQR<MatrixType, PermutationIndex>
   ::householderQ() const
 {
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
@@ -685,10 +668,11 @@
   * \sa class ColPivHouseholderQR
   */
 template<typename Derived>
-const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
+template<typename PermutationIndexType>
+const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndexType>
 MatrixBase<Derived>::colPivHouseholderQr() const
 {
-  return ColPivHouseholderQR<PlainObject>(eval());
+  return ColPivHouseholderQR<PlainObject, PermutationIndexType>(eval());
 }
 
 } // end namespace Eigen
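
Note: ColPivHouseholderQR now carries the permutation index type as a second template parameter (presumably defaulted in ForwardDeclarations.h, not shown here, so existing single-parameter uses keep compiling); a usage sketch with an explicitly chosen index type (illustrative, not part of this patch):

    #include <Eigen/QR>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(200, 100);
      Eigen::VectorXd b = Eigen::VectorXd::Random(200);
      // Request a wider permutation index than the default, e.g. for very wide matrices.
      Eigen::ColPivHouseholderQR<Eigen::MatrixXd, long> qr(A);
      Eigen::VectorXd x = qr.solve(b);
      (void)x;
      return 0;
    }
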
diff --git a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
index 7652d31..8a6ae9b 100644
--- a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -36,64 +36,117 @@
 
 #include "./InternalHeaderCheck.h"
 
-namespace Eigen { 
+namespace Eigen {
 
-/** \internal Specialization for the data types supported by LAPACKe */
+#if defined(EIGEN_USE_LAPACKE)
 
-#define EIGEN_LAPACKE_QR_COLPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
-template<> template<typename InputType> inline \
-ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
-ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
-              const EigenBase<InputType>& matrix) \
-\
-{ \
-  using std::abs; \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::RealScalar RealScalar; \
-  Index rows = matrix.rows();\
-  Index cols = matrix.cols();\
-\
-  m_qr = matrix;\
-  Index size = m_qr.diagonalSize();\
-  m_hCoeffs.resize(size);\
-\
-  m_colsTranspositions.resize(cols);\
-  /*Index number_of_transpositions = 0;*/ \
-\
-  m_nonzero_pivots = 0; \
-  m_maxpivot = RealScalar(0);\
-  m_colsPermutation.resize(cols); \
-  m_colsPermutation.indices().setZero(); \
-\
-  lapack_int lda = internal::convert_index<lapack_int,Index>(m_qr.outerStride()); \
-  lapack_int matrix_order = LAPACKE_COLROW; \
-  LAPACKE_##LAPACKE_PREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
-                              (LAPACKE_TYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (LAPACKE_TYPE*)m_hCoeffs.data()); \
-  m_isInitialized = true; \
-  m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
-  m_hCoeffs.adjointInPlace(); \
-  RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
-  lapack_int *perm = m_colsPermutation.indices().data(); \
-  for(Index i=0;i<size;i++) { \
-    m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
-  } \
-  for(Index i=0;i<cols;i++) perm[i]--;\
-\
-  /*m_det_pq = (number_of_transpositions%2) ? -1 : 1;  // TODO: It's not needed now; fix upon availability in Eigen */ \
-\
-  return *this; \
-}
+    template<typename Scalar>
+    inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, Scalar* a, lapack_int lda, lapack_int* jpvt, Scalar* tau);
+    template<>
+    inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt, float* tau)
+    { return LAPACKE_sgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
+    template<>
+    inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt, double* tau)
+    { return LAPACKE_dgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
+    template<>
+    inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int* jpvt, lapack_complex_float* tau)
+    { return LAPACKE_cgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
+    template<>
+    inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int* jpvt, lapack_complex_double* tau)
+    { return LAPACKE_zgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
 
-EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, ColMajor, LAPACK_COL_MAJOR)
+    template <typename MatrixType>
+    struct ColPivHouseholderQR_LAPACKE_impl {
+      typedef typename MatrixType::Scalar Scalar;
+      typedef typename MatrixType::RealScalar RealScalar;
+      typedef typename internal::lapacke_helpers::translate_type_imp<Scalar>::type LapackeType;
+      static constexpr int LapackeStorage = MatrixType::IsRowMajor ? (LAPACK_ROW_MAJOR) : (LAPACK_COL_MAJOR);
 
-EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, RowMajor, LAPACK_ROW_MAJOR)
+      typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+      typedef PermutationMatrix<Dynamic, Dynamic, lapack_int> PermutationType;
 
-} // end namespace Eigen
+      static void run(MatrixType& qr, HCoeffsType& hCoeffs, PermutationType& colsPermutation, Index& nonzero_pivots,
+                      RealScalar& maxpivot, bool usePrescribedThreshold, RealScalar prescribedThreshold, Index& det_p,
+                      bool& isInitialized) {
 
-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+        isInitialized = false;
+        hCoeffs.resize(qr.diagonalSize());
+        nonzero_pivots = 0;
+        maxpivot = RealScalar(0);
+        colsPermutation.resize(qr.cols());
+        colsPermutation.indices().setZero();
+
+        lapack_int rows = internal::lapacke_helpers::to_lapack(qr.rows());
+        lapack_int cols = internal::lapacke_helpers::to_lapack(qr.cols());
+        LapackeType* qr_data = (LapackeType*)(qr.data());
+        lapack_int lda = internal::lapacke_helpers::to_lapack(qr.outerStride());
+        lapack_int* perm_data = colsPermutation.indices().data();
+        LapackeType* hCoeffs_data = (LapackeType*)(hCoeffs.data());
+
+        lapack_int info = call_geqp3(LapackeStorage, rows, cols, qr_data, lda, perm_data, hCoeffs_data);
+        if (info != 0) return;
+
+        maxpivot = qr.diagonal().cwiseAbs().maxCoeff();
+        hCoeffs.adjointInPlace();
+        RealScalar defaultThreshold = NumTraits<RealScalar>::epsilon() * RealScalar(qr.diagonalSize());
+        RealScalar threshold = usePrescribedThreshold ? prescribedThreshold : defaultThreshold;
+        RealScalar premultiplied_threshold = maxpivot * threshold;
+        nonzero_pivots = (qr.diagonal().cwiseAbs().array() > premultiplied_threshold).count();
+        colsPermutation.indices().array() -= 1;
+        det_p = colsPermutation.determinant();
+        isInitialized = true;
+      };
+
+      static void init(Index rows, Index cols, HCoeffsType& hCoeffs, PermutationType& colsPermutation,
+                              bool& usePrescribedThreshold, bool& isInitialized) {
+
+        Index diag = numext::mini(rows, cols);
+        hCoeffs.resize(diag);
+        colsPermutation.resize(cols);
+        usePrescribedThreshold = false;
+        isInitialized = false;
+      }
+    };
+
+    #define COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE)                                                                 \
+    template <> inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::computeInPlace() {                             \
+      ColPivHouseholderQR_LAPACKE_impl<MatrixType>::run(m_qr, m_hCoeffs, m_colsPermutation, m_nonzero_pivots,        \
+                                                        m_maxpivot, m_usePrescribedThreshold, m_prescribedThreshold, \
+                                                        m_det_p, m_isInitialized); }                                 \
+
+    #define COLPIVQR_LAPACKE_INIT(EIGTYPE)                                                                          \
+    template <> inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::init(Index rows, Index cols) {                \
+      ColPivHouseholderQR_LAPACKE_impl<MatrixType>::init(rows, cols, m_hCoeffs, m_colsPermutation, m_isInitialized, \
+                                                         m_usePrescribedThreshold); }                               \
+
+    #define COLPIVQR_LAPACKE(EIGTYPE)          \
+      COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE) \
+      COLPIVQR_LAPACKE_INIT(EIGTYPE)           \
+
+    typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXfR;
+    typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXdR;
+    typedef Matrix<scomplex, Dynamic, Dynamic, RowMajor> MatrixXcfR;
+    typedef Matrix<dcomplex, Dynamic, Dynamic, RowMajor> MatrixXcdR;
+
+    COLPIVQR_LAPACKE(MatrixXf)
+    COLPIVQR_LAPACKE(MatrixXd)
+    COLPIVQR_LAPACKE(MatrixXcf)
+    COLPIVQR_LAPACKE(MatrixXcd)
+    COLPIVQR_LAPACKE(MatrixXfR)
+    COLPIVQR_LAPACKE(MatrixXdR)
+    COLPIVQR_LAPACKE(MatrixXcfR)
+    COLPIVQR_LAPACKE(MatrixXcdR)
+
+    COLPIVQR_LAPACKE(Ref<MatrixXf>)
+    COLPIVQR_LAPACKE(Ref<MatrixXd>)
+    COLPIVQR_LAPACKE(Ref<MatrixXcf>)
+    COLPIVQR_LAPACKE(Ref<MatrixXcd>)
+    COLPIVQR_LAPACKE(Ref<MatrixXfR>)
+    COLPIVQR_LAPACKE(Ref<MatrixXdR>)
+    COLPIVQR_LAPACKE(Ref<MatrixXcfR>)
+    COLPIVQR_LAPACKE(Ref<MatrixXcdR>)
+
+#endif
+}  // end namespace Eigen
+
+#endif  // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
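
Note: with EIGEN_USE_LAPACKE defined (and LAPACKE/BLAS linked in), the specializations above apply to ColPivHouseholderQR instances whose PermutationIndex is lapack_int; a rough usage sketch (illustrative, not part of this patch):

    #define EIGEN_USE_LAPACKE   // requires linking against a LAPACKE/BLAS implementation
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(300, 300);
      // lapack_int as PermutationIndex selects the LAPACKE_dgeqp3-backed computeInPlace().
      Eigen::ColPivHouseholderQR<Eigen::MatrixXd, lapack_int> qr(A);
      (void)qr;
      return 0;
    }
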
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 02583a2..171e601 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -15,12 +15,12 @@
 namespace Eigen {
 
 namespace internal {
-template <typename MatrixType_>
-struct traits<CompleteOrthogonalDecomposition<MatrixType_> >
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_> >
     : traits<MatrixType_> {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
+  typedef PermutationIndex_ PermutationIndex;
   enum { Flags = 0 };
 };
 
@@ -49,8 +49,8 @@
   * 
   * \sa MatrixBase::completeOrthogonalDecomposition()
   */
-template <typename MatrixType_> class CompleteOrthogonalDecomposition
-          : public SolverBase<CompleteOrthogonalDecomposition<MatrixType_> >
+template <typename MatrixType_, typename PermutationIndex_> class CompleteOrthogonalDecomposition
+          : public SolverBase<CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_> >
 {
  public:
   typedef MatrixType_ MatrixType;
@@ -58,14 +58,14 @@
 
   template<typename Derived>
   friend struct internal::solve_assertion;
-
+  typedef PermutationIndex_ PermutationIndex;
   EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
   enum {
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
+  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex>
       PermutationType;
   typedef typename internal::plain_row_type<MatrixType, Index>::type
       IntRowVectorType;
@@ -78,9 +78,6 @@
       HouseholderSequenceType;
   typedef typename MatrixType::PlainObject PlainObject;
 
- private:
-  typedef typename PermutationType::Index PermIndexType;
-
  public:
   /**
    * \brief Default Constructor.
@@ -417,26 +414,26 @@
   template <typename Rhs>
   void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const;
 
-  ColPivHouseholderQR<MatrixType> m_cpqr;
+  ColPivHouseholderQR<MatrixType, PermutationIndex> m_cpqr;
   HCoeffsType m_zCoeffs;
   RowVectorType m_temp;
 };
 
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
 typename MatrixType::Scalar
-CompleteOrthogonalDecomposition<MatrixType>::determinant() const {
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::determinant() const {
   return m_cpqr.determinant();
 }
 
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
 typename MatrixType::RealScalar
-CompleteOrthogonalDecomposition<MatrixType>::absDeterminant() const {
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::absDeterminant() const {
   return m_cpqr.absDeterminant();
 }
 
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
 typename MatrixType::RealScalar
-CompleteOrthogonalDecomposition<MatrixType>::logAbsDeterminant() const {
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::logAbsDeterminant() const {
   return m_cpqr.logAbsDeterminant();
 }
 
@@ -447,11 +444,10 @@
  * \sa class CompleteOrthogonalDecomposition,
  * CompleteOrthogonalDecomposition(const MatrixType&)
  */
-template <typename MatrixType>
-void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
+template <typename MatrixType, typename PermutationIndex>
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::computeInPlace()
 {
-  // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(m_cpqr.cols() <= NumTraits<int>::highest());
+  eigen_assert(m_cpqr.cols() <= NumTraits<PermutationIndex>::highest());
 
   const Index rank = m_cpqr.rank();
   const Index cols = m_cpqr.cols();
@@ -503,9 +499,9 @@
   }
 }
 
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
 template <bool Conjugate, typename Rhs>
-void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::applyZOnTheLeftInPlace(
     Rhs& rhs) const {
   const Index cols = this->cols();
   const Index nrhs = rhs.cols();
@@ -525,9 +521,9 @@
   }
 }
 
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
 template <typename Rhs>
-void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::applyZAdjointOnTheLeftInPlace(
     Rhs& rhs) const {
   const Index cols = this->cols();
   const Index nrhs = rhs.cols();
@@ -548,9 +544,9 @@
 }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-template <typename MatrixType_>
+template <typename MatrixType_, typename PermutationIndex_>
 template <typename RhsType, typename DstType>
-void CompleteOrthogonalDecomposition<MatrixType_>::_solve_impl(
+void CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>::_solve_impl(
     const RhsType& rhs, DstType& dst) const {
   const Index rank = this->rank();
   if (rank == 0) {
@@ -580,9 +576,9 @@
   dst = colsPermutation() * dst;
 }
 
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
 template<bool Conjugate, typename RhsType, typename DstType>
-void CompleteOrthogonalDecomposition<MatrixType_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+void CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
 {
   const Index rank = this->rank();
 
@@ -611,17 +607,17 @@
 
 namespace internal {
 
-template<typename MatrixType>
-struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType> > >
+template<typename MatrixType, typename PermutationIndex>
+struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> > >
   : traits<typename Transpose<typename MatrixType::PlainObject>::PlainObject>
 {
   enum { Flags = 0 };
 };
 
-template<typename DstXprType, typename MatrixType>
-struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
+template<typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::Scalar>, Dense2Dense>
 {
-  typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
+  typedef CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> CodType;
   typedef Inverse<CodType> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
   {
@@ -633,9 +629,9 @@
 } // end namespace internal
 
 /** \returns the matrix Q as a sequence of householder transformations */
-template <typename MatrixType>
-typename CompleteOrthogonalDecomposition<MatrixType>::HouseholderSequenceType
-CompleteOrthogonalDecomposition<MatrixType>::householderQ() const {
+template <typename MatrixType, typename PermutationIndex>
+typename CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::HouseholderSequenceType
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::householderQ() const {
   return m_cpqr.householderQ();
 }
 
@@ -644,7 +640,8 @@
   * \sa class CompleteOrthogonalDecomposition
   */
 template <typename Derived>
-const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject>
+template <typename PermutationIndex>
+const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
 MatrixBase<Derived>::completeOrthogonalDecomposition() const {
   return CompleteOrthogonalDecomposition<PlainObject>(eval());
 }
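
Note: CompleteOrthogonalDecomposition picks up the same PermutationIndex parameter; typical use is unchanged, e.g. a minimum-norm least-squares solve (illustrative sketch, not part of this patch):

    #include <Eigen/QR>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 60);
      Eigen::VectorXd b = Eigen::VectorXd::Random(100);
      Eigen::CompleteOrthogonalDecomposition<Eigen::MatrixXd> cod(A);  // default PermutationIndex
      Eigen::VectorXd x = cod.solve(b);  // minimum-norm solution if A is rank deficient
      (void)x;
      return 0;
    }
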
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index ec7e19b..588f917 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -17,19 +17,19 @@
 
 namespace internal {
 
-template<typename MatrixType_> struct traits<FullPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> struct traits<FullPivHouseholderQR<MatrixType_, PermutationIndex_> >
  : traits<MatrixType_>
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
+  typedef PermutationIndex_ PermutationIndex;
   enum { Flags = 0 };
 };
 
-template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
+template<typename MatrixType, typename PermutationIndex> struct FullPivHouseholderQRMatrixQReturnType;
 
-template<typename MatrixType>
-struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+template<typename MatrixType, typename PermutationIndex>
+struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> >
 {
   typedef typename MatrixType::PlainObject ReturnType;
 };
@@ -59,26 +59,27 @@
   * 
   * \sa MatrixBase::fullPivHouseholderQr()
   */
-template<typename MatrixType_> class FullPivHouseholderQR
-        : public SolverBase<FullPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> class FullPivHouseholderQR
+        : public SolverBase<FullPivHouseholderQR<MatrixType_, PermutationIndex_> >
 {
   public:
 
     typedef MatrixType_ MatrixType;
     typedef SolverBase<FullPivHouseholderQR> Base;
     friend class SolverBase<FullPivHouseholderQR>;
-
+    typedef PermutationIndex_ PermutationIndex;
     EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
+    
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
+    typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef Matrix<StorageIndex, 1,
+    typedef Matrix<PermutationIndex, 1,
                    internal::min_size_prefer_dynamic(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
                    internal::min_size_prefer_fixed(MaxColsAtCompileTime, MaxRowsAtCompileTime)> IntDiagSizeVectorType;
-    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
+    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
     typedef typename MatrixType::PlainObject PlainObject;
@@ -437,8 +438,8 @@
     Index m_det_p;
 };
 
-template<typename MatrixType>
-typename MatrixType::Scalar FullPivHouseholderQR<MatrixType>::determinant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar FullPivHouseholderQR<MatrixType, PermutationIndex>::determinant() const
 {
   eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -447,8 +448,8 @@
   return m_qr.diagonal().prod() * detQ * Scalar(m_det_p);
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::absDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType, PermutationIndex>::absDeterminant() const
 {
   using std::abs;
   eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
@@ -456,8 +457,8 @@
   return abs(m_qr.diagonal().prod());
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType, PermutationIndex>::logAbsDeterminant() const
 {
   eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -470,18 +471,19 @@
   *
   * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
   */
-template<typename MatrixType>
+template<typename MatrixType, typename PermutationIndex>
 template<typename InputType>
-FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+FullPivHouseholderQR<MatrixType, PermutationIndex>& FullPivHouseholderQR<MatrixType, PermutationIndex>::compute(const EigenBase<InputType>& matrix)
 {
   m_qr = matrix.derived();
   computeInPlace();
   return *this;
 }
 
-template<typename MatrixType>
-void FullPivHouseholderQR<MatrixType>::computeInPlace()
+template<typename MatrixType, typename PermutationIndex>
+void FullPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace()
 {
+  eigen_assert(m_qr.cols() <= NumTraits<PermutationIndex>::highest());
   using std::abs;
   Index rows = m_qr.rows();
   Index cols = m_qr.cols();
@@ -523,15 +525,15 @@
       m_nonzero_pivots = k;
       for(Index i = k; i < size; i++)
       {
-        m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
-        m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_rows_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
+        m_cols_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
         m_hCoeffs.coeffRef(i) = Scalar(0);
       }
       break;
     }
 
-    m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
-    m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
+    m_rows_transpositions.coeffRef(k) = internal::convert_index<PermutationIndex>(row_of_biggest_in_corner);
+    m_cols_transpositions.coeffRef(k) = internal::convert_index<PermutationIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
       ++number_of_transpositions;
@@ -561,9 +563,9 @@
 }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
 template<typename RhsType, typename DstType>
-void FullPivHouseholderQR<MatrixType_>::_solve_impl(const RhsType &rhs, DstType &dst) const
+void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
   const Index l_rank = rank();
 
@@ -595,9 +597,9 @@
   for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
 }
 
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
 template<bool Conjugate, typename RhsType, typename DstType>
-void FullPivHouseholderQR<MatrixType_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
 {
   const Index l_rank = rank();
 
@@ -634,10 +636,10 @@
 
 namespace internal {
   
-template<typename DstXprType, typename MatrixType>
-struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+template<typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType, PermutationIndex> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType, PermutationIndex>::Scalar>, Dense2Dense>
 {
-  typedef FullPivHouseholderQR<MatrixType> QrType;
+  typedef FullPivHouseholderQR<MatrixType, PermutationIndex> QrType;
   typedef Inverse<QrType> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
   {    
@@ -651,11 +653,11 @@
   *
   * \tparam MatrixType type of underlying dense matrix
   */
-template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
-  : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+template<typename MatrixType, typename PermutationIndex> struct FullPivHouseholderQRMatrixQReturnType
+  : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> >
 {
 public:
-  typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
+  typedef typename FullPivHouseholderQR<MatrixType, PermutationIndex>::IntDiagSizeVectorType IntDiagSizeVectorType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
                  MatrixType::MaxRowsAtCompileTime> WorkVectorType;
@@ -712,8 +714,8 @@
 
 } // end namespace internal
 
-template<typename MatrixType>
-inline typename FullPivHouseholderQR<MatrixType>::MatrixQReturnType FullPivHouseholderQR<MatrixType>::matrixQ() const
+template<typename MatrixType, typename PermutationIndex>
+inline typename FullPivHouseholderQR<MatrixType, PermutationIndex>::MatrixQReturnType FullPivHouseholderQR<MatrixType, PermutationIndex>::matrixQ() const
 {
   eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
   return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
@@ -724,10 +726,11 @@
   * \sa class FullPivHouseholderQR
   */
 template<typename Derived>
-const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
+template<typename PermutationIndex>
+const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
 MatrixBase<Derived>::fullPivHouseholderQr() const
 {
-  return FullPivHouseholderQR<PlainObject>(eval());
+  return FullPivHouseholderQR<PlainObject, PermutationIndex>(eval());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index d89d99e..6b0fb4b 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -193,10 +193,8 @@
     inline void moveChunk(Index from, Index to, Index chunkSize)
     {
       eigen_internal_assert(chunkSize >= 0 && to+chunkSize <= m_size);
-      if (chunkSize > 0) {
-        internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
-        internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
-      }
+      internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
+      internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
     }
 
   protected:
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index 243cd16..95af15a 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -60,14 +60,14 @@
     /** \returns the number of non zero coefficients */
     inline Index nonZeros() const
     {
-      if(Derived::IsVectorAtCompileTime && outerIndexPtr()==0)
-        return derived().nonZeros();
-      else if(isCompressed())
-        return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0];
-      else if(derived().outerSize()==0)
-        return 0;
-      else
-        return innerNonZeros().sum();
+     if (Derived::IsVectorAtCompileTime && outerIndexPtr() == 0)
+       return derived().nonZeros();
+     else if (derived().outerSize() == 0)
+       return 0;
+     else if (isCompressed())
+       return outerIndexPtr()[derived().outerSize()] - outerIndexPtr()[0];
+     else
+       return innerNonZeros().sum();
     }
     
     /** \returns a const pointer to the array of values.
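
Note: reordering the outerSize()==0 check above guards the outer index access for degenerate matrices; a quick sketch of the behaviour this protects (illustrative, not part of this patch):

    #include <Eigen/SparseCore>
    #include <cassert>

    int main() {
      Eigen::SparseMatrix<double> empty;          // 0 x 0
      Eigen::SparseMatrix<double> noCols(10, 0);  // outerSize() == 0 in column-major storage
      assert(empty.nonZeros() == 0);
      assert(noCols.nonZeros() == 0);
      return 0;
    }
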
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index eaefcef..bf1d562 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -238,13 +238,24 @@
       const Index outer = IsRowMajor ? row : col;
       const Index inner = IsRowMajor ? col : row;
       Index start = m_outerIndex[outer];
-      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer + 1];
+      Index end = isCompressed() ? m_outerIndex[outer + 1] : m_outerIndex[outer] + m_innerNonZeros[outer];
       eigen_assert(end >= start && "you probably called coeffRef on a non finalized matrix");
-      if (end <= start) return insertAtByOuterInner(outer, inner, start);
-      Index dst = m_data.searchLowerIndex(start, end, inner);
+      Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+      if (dst == end) {
+        Index capacity = m_outerIndex[outer + 1] - end;
+        if (capacity > 0) {
+          // implies uncompressed: push to back of vector
+          m_innerNonZeros[outer]++;
+          m_data.index(end) = StorageIndex(inner);
+          m_data.value(end) = Scalar(0);
+          return m_data.value(end);
+        }
+      }
       if ((dst < end) && (m_data.index(dst) == inner))
+        // this coefficient exists, return a reference to it
         return m_data.value(dst);
       else
+        // insertion will require reconfiguring the buffer
         return insertAtByOuterInner(outer, inner, dst);
     }
 
@@ -323,7 +334,7 @@
       if(isCompressed())
       {
         Index totalReserveSize = 0;
-        for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += reserveSizes[j];
+        for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += internal::convert_index<Index>(reserveSizes[j]);
 
         // if reserveSizes is empty, don't do anything!
         if (totalReserveSize == 0) return;
@@ -334,11 +345,12 @@
         // temporarily use m_innerSizes to hold the new starting points.
         StorageIndex* newOuterIndex = m_innerNonZeros;
         
-        StorageIndex count = 0;
+        Index count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
-          newOuterIndex[j] = count;
-          count += reserveSizes[j] + (m_outerIndex[j+1]-m_outerIndex[j]);
+          newOuterIndex[j] = internal::convert_index<StorageIndex>(count);
+          Index reserveSize = internal::convert_index<Index>(reserveSizes[j]);
+          count += reserveSize + internal::convert_index<Index>(m_outerIndex[j+1]-m_outerIndex[j]);
         }
 
         m_data.reserve(totalReserveSize);
@@ -364,29 +376,24 @@
       {
         StorageIndex* newOuterIndex = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize + 1);
         
-        StorageIndex count = 0;
+        Index count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
-          newOuterIndex[j] = count;
-          StorageIndex alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
-          StorageIndex toReserve = std::max<StorageIndex>(reserveSizes[j], alreadyReserved);
-          count += toReserve + m_innerNonZeros[j];
+          newOuterIndex[j] = internal::convert_index<StorageIndex>(count);
+          Index alreadyReserved = internal::convert_index<Index>(m_outerIndex[j+1] - m_outerIndex[j] - m_innerNonZeros[j]);
+          Index reserveSize = internal::convert_index<Index>(reserveSizes[j]);
+          Index toReserve = numext::maxi(reserveSize, alreadyReserved);
+          count += toReserve + internal::convert_index<Index>(m_innerNonZeros[j]);
         }
-        newOuterIndex[m_outerSize] = count;
-        
+        newOuterIndex[m_outerSize] = internal::convert_index<StorageIndex>(count);
+
         m_data.resize(count);
         for(Index j=m_outerSize-1; j>=0; --j)
         {
-          StorageIndex offset = newOuterIndex[j] - m_outerIndex[j];
-          if(offset>0)
-          {
-            StorageIndex innerNNZ = m_innerNonZeros[j];
-            StorageIndex begin = m_outerIndex[j];
-            StorageIndex end = begin + innerNNZ;
-            StorageIndex target = newOuterIndex[j];
-            internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
-            internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
-          }
+          StorageIndex innerNNZ = m_innerNonZeros[j];
+          StorageIndex begin = m_outerIndex[j];
+          StorageIndex target = newOuterIndex[j];
+          m_data.moveChunk(begin, target, innerNNZ);
         }
         
         std::swap(m_outerIndex, newOuterIndex);
@@ -488,7 +495,22 @@
       * same as insert(Index,Index) except that the indices are given relative to the storage order */
     Scalar& insertByOuterInner(Index j, Index i)
     {
-      return insert(IsRowMajor ? j : i, IsRowMajor ? i : j);
+      Index start = m_outerIndex[j];
+      Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
+      Index dst = start == end ? end : m_data.searchLowerIndex(start, end, i);
+      if (dst == end) {
+        Index capacity = m_outerIndex[j + 1] - end;
+        if (capacity > 0) {
+          // implies uncompressed: push to back of vector
+          m_innerNonZeros[j]++;
+          m_data.index(end) = StorageIndex(i);
+          m_data.value(end) = Scalar(0);
+          return m_data.value(end);
+        }
+      }
+      eigen_assert((dst == end || m_data.index(dst) != i) &&
+          "you cannot insert an element that already exists, you must call coeffRef to this end");
+      return insertAtByOuterInner(j, i, dst);
     }
 
     /** Turns the matrix into the \em compressed format.
@@ -498,36 +520,45 @@
       if (isCompressed()) return;
       
       eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
-      
+
       StorageIndex start = m_outerIndex[1];
       m_outerIndex[1] = m_innerNonZeros[0];
-      for(Index j=1; j<m_outerSize; ++j)
+      // try to move fewer, larger contiguous chunks
+      Index copyStart = start;
+      Index copyTarget = m_innerNonZeros[0];
+      for (Index j = 1; j < m_outerSize; j++)
       {
         StorageIndex end = start + m_innerNonZeros[j];
-        StorageIndex target = m_outerIndex[j];
-        if (start != target)
+        StorageIndex nextStart = m_outerIndex[j + 1];
+        // don't forget to move the last chunk!
+        bool breakUpCopy = (end != nextStart) || (j == m_outerSize - 1);
+        if (breakUpCopy)
         {
-          internal::smart_memmove(innerIndexPtr() + start, innerIndexPtr() + end, innerIndexPtr() + target);
-          internal::smart_memmove(valuePtr() + start, valuePtr() + end, valuePtr() + target);
+          Index chunkSize = end - copyStart;
+          if(chunkSize > 0) m_data.moveChunk(copyStart, copyTarget, chunkSize);
+          copyStart = nextStart;
+          copyTarget += chunkSize;
         }
-        start = m_outerIndex[j + 1];
+        start = nextStart;
         m_outerIndex[j + 1] = m_outerIndex[j] + m_innerNonZeros[j];
       }
+      m_data.resize(m_outerIndex[m_outerSize]);
+
+      // release as much memory as possible
       internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
       m_innerNonZeros = 0;
-      m_data.resize(m_outerIndex[m_outerSize]);
       m_data.squeeze();
     }
 
     /** Turns the matrix into the uncompressed mode */
     void uncompress()
     {
-      if(m_innerNonZeros != 0) return; 
+      if (!isCompressed()) return;
       m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
-      typename IndexVector::AlignedMapType innerNonZeroMap(m_innerNonZeros, m_outerSize);
-      typename IndexVector::ConstAlignedMapType outerIndexMap(m_outerIndex, m_outerSize);
-      typename IndexVector::ConstMapType nextOuterIndexMap(m_outerIndex + 1, m_outerSize);
-      innerNonZeroMap = nextOuterIndexMap - outerIndexMap;
+      if (m_outerIndex[m_outerSize] == 0)
+        std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+      else
+        for (Index j = 0; j < m_outerSize; j++) m_innerNonZeros[j] = m_outerIndex[j + 1] - m_outerIndex[j];
     }
 
     /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
@@ -590,40 +621,40 @@
       Index newOuterSize = IsRowMajor ? rows : cols;
       Index newInnerSize = IsRowMajor ? cols : rows;
 
-      Index innerChange = newInnerSize - innerSize();
-      Index outerChange = newOuterSize - outerSize();
+      Index innerChange = newInnerSize - m_innerSize;
+      Index outerChange = newOuterSize - m_outerSize;
 
       if (outerChange != 0) {
         m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
-            outerIndexPtr(), newOuterSize + 1, outerSize() + 1);
+            m_outerIndex, newOuterSize + 1, m_outerSize + 1);
 
         if (!isCompressed())
           m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
-              innerNonZeroPtr(), newOuterSize, outerSize());
+              m_innerNonZeros, newOuterSize, m_outerSize);
 
         if (outerChange > 0) {
-          StorageIndex lastIdx = outerSize() == 0 ? StorageIndex(0) : outerIndexPtr()[outerSize()];
-          std::fill_n(outerIndexPtr() + outerSize(), outerChange + 1, lastIdx);
+          StorageIndex lastIdx = m_outerSize == 0 ? StorageIndex(0) : m_outerIndex[m_outerSize];
+          std::fill_n(m_outerIndex + m_outerSize, outerChange + 1, lastIdx);
 
-          if (!isCompressed()) std::fill_n(innerNonZeroPtr() + outerSize(), outerChange, StorageIndex(0));
+          if (!isCompressed()) std::fill_n(m_innerNonZeros + m_outerSize, outerChange, StorageIndex(0));
         }
       }
       m_outerSize = newOuterSize;
 
       if (innerChange < 0) {
-        for (Index j = 0; j < outerSize(); j++) {
-          Index start = outerIndexPtr()[j];
-          Index end = isCompressed() ? outerIndexPtr()[j + 1] : start + innerNonZeroPtr()[j];
-          Index lb = data().searchLowerIndex(start, end, newInnerSize);
+        for (Index j = 0; j < m_outerSize; j++) {
+          Index start = m_outerIndex[j];
+          Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
+          Index lb = m_data.searchLowerIndex(start, end, newInnerSize);
           if (lb != end) {
             uncompress();
-            innerNonZeroPtr()[j] = StorageIndex(lb - start);
+            m_innerNonZeros[j] = StorageIndex(lb - start);
           }
         }
       }
       m_innerSize = newInnerSize;
 
-      Index newSize = outerIndexPtr()[outerSize()];
+      Index newSize = m_outerIndex[m_outerSize];
       eigen_assert(newSize <= m_data.size());
       m_data.resize(newSize);
     }
@@ -640,17 +671,15 @@
       const Index outerSize = IsRowMajor ? rows : cols;
       m_innerSize = IsRowMajor ? cols : rows;
       m_data.clear();
-      if (m_outerSize != outerSize || m_outerSize==0)
-      {
-        m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_outerIndex, outerSize + 1,
-            m_outerSize + 1);
+
+      if ((m_outerIndex == 0) || (m_outerSize != outerSize)) {
+        m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_outerIndex, outerSize + 1, m_outerSize + 1);
         m_outerSize = outerSize;
       }
-      if(m_innerNonZeros)
-      {
-        internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-        m_innerNonZeros = 0;
-      }
+
+      internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+      m_innerNonZeros = 0;
+
       std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
     }
 
@@ -672,7 +701,7 @@
 
     /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
     inline SparseMatrix()
-      : m_outerSize(-1), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
     {
       resize(0, 0);
     }
@@ -756,20 +785,15 @@
       * This function also turns the matrix into compressed mode, and drop any reserved memory. */
     inline void setIdentity()
     {
-      eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
-      if (m_innerNonZeros) {
-        internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-        m_innerNonZeros = 0;
-      }
-      m_data.resize(rows());
+      eigen_assert(m_outerSize == m_innerSize && "ONLY FOR SQUARED MATRICES");
+      internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+      m_innerNonZeros = 0;
+      m_data.resize(m_outerSize);
       // is it necessary to squeeze?
       m_data.squeeze();
-      typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), rows() + 1);
-      typename IndexVector::AlignedMapType innerIndexMap(innerIndexPtr(), rows());
-      typename ScalarVector::AlignedMapType valueMap(valuePtr(), rows());
-      outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
-      innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
-      valueMap.setOnes();
+      std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
+      std::iota(innerIndexPtr(), innerIndexPtr() + m_outerSize, StorageIndex(0));
+      std::fill_n(valuePtr(), m_outerSize, Scalar(1));
     }
 
     inline SparseMatrix& operator=(const SparseMatrix& other)
@@ -880,11 +904,8 @@
     void initAssignment(const Other& other)
     {
       resize(other.rows(), other.cols());
-      if(m_innerNonZeros)
-      {
-        internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-        m_innerNonZeros = 0;
-      }
+      internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+      m_innerNonZeros = 0;
     }
 
     /** \internal
@@ -922,8 +943,9 @@
       eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
 
       Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
-      m_data.index(p) = convert_index(inner);
-      return (m_data.value(p) = Scalar(0));
+      m_data.index(p) = StorageIndex(inner);
+      m_data.value(p) = Scalar(0);
+      return m_data.value(p);
     }
 protected:
     struct IndexPosPair {
@@ -946,7 +968,6 @@
     {
       
       constexpr StorageIndex kEmptyIndexVal(-1);
-      typedef typename IndexVector::AlignedMapType IndexMap;
       typedef typename ScalarVector::AlignedMapType ValueMap;
 
       Index n = diagXpr.size();
@@ -954,22 +975,18 @@
       const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
       if(overwrite)
       {
-        if((this->rows()!=n) || (this->cols()!=n))
-          this->resize(n, n);
+        if((m_outerSize != n) || (m_innerSize != n))
+          resize(n, n);
       }
 
-      if(data().size()==0 || overwrite)
+      if(m_data.size()==0 || overwrite)
       {
-        if (!isCompressed()) {
-          internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-          m_innerNonZeros = 0;
-        }
+        internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+        m_innerNonZeros = 0;
         resizeNonZeros(n);
-        IndexMap outerIndexMap(outerIndexPtr(), n + 1);
-        IndexMap innerIndexMap(innerIndexPtr(), n);
         ValueMap valueMap(valuePtr(), n);
-        outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
-        innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+        std::iota(m_outerIndex, m_outerIndex + n + 1, StorageIndex(0));
+        std::iota(innerIndexPtr(), innerIndexPtr() + n, StorageIndex(0));
         valueMap.setZero();
         internal::call_assignment_no_alias(valueMap, diagXpr, assignFunc);
       }
@@ -985,12 +1002,12 @@
           Index shift = 0;
 
           for (Index j = 0; j < n; j++) {
-            Index begin = outerIndexPtr()[j];
-            Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
-            Index capacity = outerIndexPtr()[j + 1] - end;
-            Index dst = data().searchLowerIndex(begin, end, j);
+            Index begin = m_outerIndex[j];
+            Index end = isCompressed() ? m_outerIndex[j + 1] : begin + m_innerNonZeros[j];
+            Index capacity = m_outerIndex[j + 1] - end;
+            Index dst = m_data.searchLowerIndex(begin, end, j);
             // the entry exists: update it now
-            if (dst != end && data().index(dst) == j) assignFunc.assignCoeff(data().value(dst), diaEval.coeff(j));
+            if (dst != end && m_data.index(dst) == StorageIndex(j)) assignFunc.assignCoeff(m_data.value(dst), diaEval.coeff(j));
             // the entry belongs at the back of the vector: push to back
             else if (dst == end && capacity > 0)
               assignFunc.assignCoeff(insertBackUncompressed(j, j), diaEval.coeff(j));
@@ -1005,13 +1022,13 @@
           
           if (deferredInsertions > 0) {
 
-            data().resize(data().size() + shift);
-            Index copyEnd = isCompressed() ? outerIndexPtr()[outerSize()]
-                                           : outerIndexPtr()[outerSize() - 1] + innerNonZeroPtr()[outerSize() - 1];
-            for (Index j = outerSize() - 1; deferredInsertions > 0; j--) {
-              Index begin = outerIndexPtr()[j];
-              Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
-              Index capacity = outerIndexPtr()[j + 1] - end;
+            m_data.resize(m_data.size() + shift);
+            Index copyEnd = isCompressed() ? m_outerIndex[m_outerSize]
+                                           : m_outerIndex[m_outerSize - 1] + m_innerNonZeros[m_outerSize - 1];
+            for (Index j = m_outerSize - 1; deferredInsertions > 0; j--) {
+              Index begin = m_outerIndex[j];
+              Index end = isCompressed() ? m_outerIndex[j + 1] : begin + m_innerNonZeros[j];
+              Index capacity = m_outerIndex[j + 1] - end;
 
               bool doInsertion = insertionLocations(j) >= 0;
               bool breakUpCopy = doInsertion && (capacity > 0);
@@ -1020,14 +1037,14 @@
               // where `threshold >= 0` to skip inactive nonzeros in each vector
               // this reduces the total number of copied elements, but requires more moveChunk calls
               if (breakUpCopy) {
-                Index copyBegin = outerIndexPtr()[j + 1];
+                Index copyBegin = m_outerIndex[j + 1];
                 Index to = copyBegin + shift;
                 Index chunkSize = copyEnd - copyBegin;
-                if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+                m_data.moveChunk(copyBegin, to, chunkSize);
                 copyEnd = end;
               }
 
-              outerIndexPtr()[j + 1] += shift;
+              m_outerIndex[j + 1] += shift;
               
               if (doInsertion) {
                 // if there is capacity, shift into the inactive nonzeros
@@ -1035,11 +1052,12 @@
                 Index copyBegin = insertionLocations(j);
                 Index to = copyBegin + shift;
                 Index chunkSize = copyEnd - copyBegin;
-                if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+                m_data.moveChunk(copyBegin, to, chunkSize);
                 Index dst = to - 1;
-                data().index(dst) = StorageIndex(j);
-                assignFunc.assignCoeff(data().value(dst) = Scalar(0), diaEval.coeff(j));
-                if (!isCompressed()) innerNonZeroPtr()[j]++;
+                m_data.index(dst) = StorageIndex(j);
+                m_data.value(dst) = Scalar(0);
+                assignFunc.assignCoeff(m_data.value(dst), diaEval.coeff(j));
+                if (!isCompressed()) m_innerNonZeros[j]++;
                 shift--;
                 deferredInsertions--;
                 copyEnd = copyBegin;
@@ -1050,9 +1068,6 @@
       }
     }
 
-    /* Provides a consistent reserve and reallocation strategy for insertCompressed and insertUncompressed
-    If there is insufficient space for one insertion, the size of the memory buffer is doubled */
-    inline void checkAllocatedSpaceAndMaybeExpand();
     /* These functions are used to avoid a redundant binary search operation in functions such as coeffRef() and assume `dst` is the appropriate sorted insertion point */
     EIGEN_STRONG_INLINE Scalar& insertAtByOuterInner(Index outer, Index inner, Index dst);
     Scalar& insertCompressedAtByOuterInner(Index outer, Index inner, Index dst);
@@ -1299,10 +1314,8 @@
   }
   m_outerIndex[m_outerSize] = count;
   // turn the matrix into compressed form
-  if (m_innerNonZeros) {
-    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-    m_innerNonZeros = 0;
-  }
+  internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+  m_innerNonZeros = 0;
   m_data.resize(m_outerIndex[m_outerSize]);
 }
 
@@ -1383,21 +1396,15 @@
 }
 
 template <typename Scalar_, int Options_, typename StorageIndex_>
-inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar& SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(
-    Index row, Index col) {
-  Index outer = IsRowMajor ? row : col;
-  Index inner = IsRowMajor ? col : row;
-  Index start = outerIndexPtr()[outer];
-  Index end = isCompressed() ? outerIndexPtr()[outer + 1] : start + innerNonZeroPtr()[outer];
-  Index dst = data().searchLowerIndex(start, end, inner);
-  eigen_assert((dst == end || data().index(dst) != inner) &&
-      "you cannot insert an element that already exists, you must call coeffRef to this end");
-  return insertAtByOuterInner(outer, inner, dst);
+inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(Index row, Index col) {
+  return insertByOuterInner(IsRowMajor ? row : col, IsRowMajor ? col : row);
 }
 
 template <typename Scalar_, int Options_, typename StorageIndex_>
 EIGEN_STRONG_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
 SparseMatrix<Scalar_, Options_, StorageIndex_>::insertAtByOuterInner(Index outer, Index inner, Index dst) {
+  // random insertion into a compressed matrix is very slow
   uncompress();
   return insertUncompressedAtByOuterInner(outer, inner, dst);
 }
@@ -1408,10 +1415,20 @@
   eigen_assert(!isCompressed());
   Index outer = IsRowMajor ? row : col;
   Index inner = IsRowMajor ? col : row;
-  Index start = outerIndexPtr()[outer];
-  Index end = start + innerNonZeroPtr()[outer];
-  Index dst = data().searchLowerIndex(start, end, inner);
-  eigen_assert((dst == end || data().index(dst) != inner) &&
+  Index start = m_outerIndex[outer];
+  Index end = start + m_innerNonZeros[outer];
+  Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+  if (dst == end) {
+    Index capacity = m_outerIndex[outer + 1] - end;
+    if (capacity > 0) {
+      // implies uncompressed: push to back of vector
+      m_innerNonZeros[outer]++;
+      m_data.index(end) = StorageIndex(inner);
+      m_data.value(end) = Scalar(0);
+      return m_data.value(end);
+    }
+  }
+  eigen_assert((dst == end || m_data.index(dst) != inner) &&
                "you cannot insert an element that already exists, you must call coeffRef to this end");
   return insertUncompressedAtByOuterInner(outer, inner, dst);
 }
@@ -1422,97 +1439,119 @@
   eigen_assert(isCompressed());
   Index outer = IsRowMajor ? row : col;
   Index inner = IsRowMajor ? col : row;
-  Index start = outerIndexPtr()[outer];
-  Index end = outerIndexPtr()[outer + 1];
-  Index dst = data().searchLowerIndex(start, end, inner);
-  eigen_assert((dst == end || data().index(dst) != inner) &&
+  Index start = m_outerIndex[outer];
+  Index end = m_outerIndex[outer + 1];
+  Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+  eigen_assert((dst == end || m_data.index(dst) != inner) &&
                "you cannot insert an element that already exists, you must call coeffRef to this end");
   return insertCompressedAtByOuterInner(outer, inner, dst);
 }
 
 template <typename Scalar_, int Options_, typename StorageIndex_>
-EIGEN_STRONG_INLINE void SparseMatrix<Scalar_, Options_, StorageIndex_>::checkAllocatedSpaceAndMaybeExpand() {
-  // if there is no capacity for a single insertion, double the capacity
-  if (data().allocatedSize() <= data().size()) {
-    // increase capacity by a mininum of 32
-    Index minReserve = 32;
-    Index reserveSize = numext::maxi(minReserve, data().allocatedSize());
-    data().reserve(reserveSize);
-  }
-}
-
-template <typename Scalar_, int Options_, typename StorageIndex_>
 typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
 SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressedAtByOuterInner(Index outer, Index inner, Index dst) {
   eigen_assert(isCompressed());
   // compressed insertion always requires expanding the buffer
-  checkAllocatedSpaceAndMaybeExpand();
-  data().resize(data().size() + 1);
-  Index chunkSize = outerIndexPtr()[outerSize()] - dst;
+  // first, check if there is adequate allocated memory
+  if (m_data.allocatedSize() <= m_data.size()) {
+    // if there is no capacity for a single insertion, double the capacity
+    // increase capacity by a minimum of 32
+    Index minReserve = 32;
+    Index reserveSize = numext::maxi(minReserve, m_data.allocatedSize());
+    m_data.reserve(reserveSize);
+  }
+  m_data.resize(m_data.size() + 1);
+  Index chunkSize = m_outerIndex[m_outerSize] - dst;
   // shift the existing data to the right if necessary
-  if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
+  m_data.moveChunk(dst, dst + 1, chunkSize);
   // update nonzero counts
-  typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
-  outerIndexMap.segment(outer + 1, outerSize() - outer).array() += 1;
+  // potentially O(outerSize) bottleneck!
+  for (Index j = outer; j < m_outerSize; j++) m_outerIndex[j + 1]++;
   // initialize the coefficient
-  data().index(dst) = StorageIndex(inner);
+  m_data.index(dst) = StorageIndex(inner);
+  m_data.value(dst) = Scalar(0);
   // return a reference to the coefficient
-  return data().value(dst) = Scalar(0);
+  return m_data.value(dst);
 }
 
 template <typename Scalar_, int Options_, typename StorageIndex_>
 typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
 SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst) {
   eigen_assert(!isCompressed());
-  // find nearest outer vector to the right with capacity (if any) to minimize copy size
-  Index target = outer;
-  for (; target < outerSize(); target++) {
-    Index start = outerIndexPtr()[target];
-    Index end = start + innerNonZeroPtr()[target];
-    Index capacity = outerIndexPtr()[target + 1] - end;
-    if (capacity > 0) {
-      // `target` has room for interior insertion
-      Index chunkSize = end - dst;
-      // shift the existing data to the right if necessary
-      if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
-      break;
+  // find a vector with capacity, starting at `outer` and searching to the left and right
+  for (Index leftTarget = outer - 1, rightTarget = outer; (leftTarget >= 0) || (rightTarget < m_outerSize);) {
+    if (rightTarget < m_outerSize) {
+      Index start = m_outerIndex[rightTarget];
+      Index end = start + m_innerNonZeros[rightTarget];
+      Index nextStart = m_outerIndex[rightTarget + 1];
+      Index capacity = nextStart - end;
+      if (capacity > 0) {
+        // move [dst, end) to dst+1 and insert at dst
+        Index chunkSize = end - dst;
+        if (chunkSize > 0) m_data.moveChunk(dst, dst + 1, chunkSize);
+        m_innerNonZeros[outer]++;
+        for (Index j = outer; j < rightTarget; j++) m_outerIndex[j + 1]++;
+        m_data.index(dst) = StorageIndex(inner);
+        m_data.value(dst) = Scalar(0);
+        return m_data.value(dst);
+      }
+      rightTarget++;
+    }
+    if (leftTarget >= 0) {
+      Index start = m_outerIndex[leftTarget];
+      Index end = start + m_innerNonZeros[leftTarget];
+      Index nextStart = m_outerIndex[leftTarget + 1];
+      Index capacity = nextStart - end;
+      if (capacity > 0) {
+        // tricky: dst is a lower bound, so we must insert at dst-1 when shifting left
+        // move [nextStart, dst) to nextStart-1 and insert at dst-1
+        Index chunkSize = dst - nextStart;
+        if (chunkSize > 0) m_data.moveChunk(nextStart, nextStart - 1, chunkSize);
+        m_innerNonZeros[outer]++;
+        for (Index j = leftTarget; j < outer; j++) m_outerIndex[j + 1]--;
+        m_data.index(dst - 1) = StorageIndex(inner);
+        m_data.value(dst - 1) = Scalar(0);
+        return m_data.value(dst - 1);
+      }
+      leftTarget--;
     }
   }
-  if (target == outerSize()) {
-    // no room for interior insertion (to the right of `outer`)
-    target = outer;
-    Index dst_offset = dst - outerIndexPtr()[target];
-    Index totalCapacity = data().allocatedSize() - data().size();
-    eigen_assert(totalCapacity >= 0);
-    if (totalCapacity == 0) {
-      // there is no room left. we must reallocate. reserve space in each vector
-      constexpr StorageIndex kReserveSizePerVector(2);
-      reserveInnerVectors(IndexVector::Constant(outerSize(), kReserveSizePerVector));
+
+  // no room for interior insertion
+  // nonZeros() == m_data.size()
+  // record offset as m_outerIndex will change
+  Index dst_offset = dst - m_outerIndex[outer];
+  // allocate space for random insertion
+  if (m_data.allocatedSize() == 0) {
+    // fast method to allocate space for one element per vector in empty matrix
+    m_data.resize(m_outerSize);
+    std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
+  } else {
+    // check for integer overflow: if maxReserveSize == 0, insertion is not possible
+    Index maxReserveSize = static_cast<Index>(NumTraits<StorageIndex>::highest()) - m_data.allocatedSize();
+    eigen_assert(maxReserveSize > 0);
+    if (m_outerSize <= maxReserveSize) {
+      // allocate space for one additional element per vector
+      reserveInnerVectors(IndexVector::Constant(m_outerSize, 1));
     } else {
-      // dont reallocate, but re-distribute the remaining capacity to the right of `outer`
-      // each vector in the range [outer,outerSize) will receive totalCapacity / (outerSize - outer) nonzero
-      // reservations each vector in the range [outer,remainder) will receive an additional nonzero reservation where
-      // remainder = totalCapacity % (outerSize - outer)
+      // handle the edge case where StorageIndex is insufficient to reserve outerSize additional elements
+      // allocate space for one additional element in the interval [outer,maxReserveSize)
       typedef internal::sparse_reserve_op<StorageIndex> ReserveSizesOp;
       typedef CwiseNullaryOp<ReserveSizesOp, IndexVector> ReserveSizesXpr;
-      ReserveSizesXpr reserveSizesXpr(outerSize(), 1, ReserveSizesOp(target, outerSize(), totalCapacity));
-      eigen_assert(reserveSizesXpr.sum() == totalCapacity);
+      ReserveSizesXpr reserveSizesXpr(m_outerSize, 1, ReserveSizesOp(outer, m_outerSize, maxReserveSize));
       reserveInnerVectors(reserveSizesXpr);
     }
-    Index start = outerIndexPtr()[target];
-    Index end = start + innerNonZeroPtr()[target];
-    dst = start + dst_offset;
-    Index chunkSize = end - dst;
-    if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
   }
-  // update nonzero counts
-  innerNonZeroPtr()[outer]++;
-  typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
-  outerIndexMap.segment(outer + 1, target - outer).array() += 1;
-  // initialize the coefficient
-  data().index(dst) = StorageIndex(inner);
-  // return a reference to the coefficient
-  return data().value(dst) = Scalar(0);
+  // insert element at `dst` with new outer indices
+  Index start = m_outerIndex[outer];
+  Index end = start + m_innerNonZeros[outer];
+  Index new_dst = start + dst_offset;
+  Index chunkSize = end - new_dst;
+  if (chunkSize > 0) m_data.moveChunk(new_dst, new_dst + 1, chunkSize);
+  m_innerNonZeros[outer]++;
+  m_data.index(new_dst) = StorageIndex(inner);
+  m_data.value(new_dst) = Scalar(0);
+  return m_data.value(new_dst);
 }
 
 namespace internal {
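Note on the refactored insertion path above: insert() now forwards to insertByOuterInner(), the uncompressed path searches both left and right of the target vector for spare capacity, and the compressed path grows the buffer in place. The public API is unchanged; the following minimal sketch (sizes and values are made up, not taken from the patch) shows the usage pattern these routines serve:

#include <Eigen/SparseCore>
#include <iostream>

int main() {
  Eigen::SparseMatrix<double> A(5, 5);          // column-major by default
  // Reserving per-column capacity lets insert() hit the fast uncompressed
  // path: the target vector already has room, so no large copy is needed.
  A.reserve(Eigen::VectorXi::Constant(5, 2));
  A.insert(0, 0) = 1.0;      // insert() requires that the entry not exist yet
  A.insert(2, 1) = 2.0;
  A.coeffRef(2, 1) += 3.0;   // coeffRef() updates an existing entry
  A.makeCompressed();        // drop the per-column slack again
  // Inserting into a compressed matrix still works, but as the comment in
  // insertAtByOuterInner notes, random insertion into compressed storage is
  // slow, so batch insertions are best done before makeCompressed().
  A.insert(4, 4) = 5.0;
  std::cout << A.nonZeros() << " nonzeros, compressed: " << A.isCompressed() << "\n";
}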
diff --git a/Eigen/src/SparseCore/SparsePermutation.h b/Eigen/src/SparseCore/SparsePermutation.h
index af9a1fe..7e402cc 100644
--- a/Eigen/src/SparseCore/SparsePermutation.h
+++ b/Eigen/src/SparseCore/SparsePermutation.h
@@ -18,69 +18,142 @@
 
 namespace internal {
 
+template<typename ExpressionType, typename PlainObjectType, bool NeedEval = !is_same<ExpressionType, PlainObjectType>::value>
+struct XprHelper
+{
+    XprHelper(const ExpressionType& xpr) : m_xpr(xpr) {}
+    inline const PlainObjectType& xpr() const { return m_xpr; }
+    // this is a new PlainObjectType initialized by xpr
+    const PlainObjectType m_xpr;
+};
+template<typename ExpressionType, typename PlainObjectType>
+struct XprHelper<ExpressionType, PlainObjectType, false>
+{
+    XprHelper(const ExpressionType& xpr) : m_xpr(xpr) {}
+    inline const PlainObjectType& xpr() const { return m_xpr; }
+    // this is a reference to xpr
+    const PlainObjectType& m_xpr;
+};
+
+template<typename PermDerived, bool NeedInverseEval>
+struct PermHelper
+{
+    using IndicesType = typename PermDerived::IndicesType;
+    using PermutationIndex = typename IndicesType::Scalar;
+    using type = PermutationMatrix<IndicesType::SizeAtCompileTime, IndicesType::MaxSizeAtCompileTime, PermutationIndex>;
+    PermHelper(const PermDerived& perm) : m_perm(perm.inverse()) {}
+    inline const type& perm() const { return m_perm; }
+    // this is a new PermutationMatrix initialized by perm.inverse()
+    const type m_perm;
+};
+template<typename PermDerived>
+struct PermHelper<PermDerived, false>
+{
+    using type = PermDerived;
+    PermHelper(const PermDerived& perm) : m_perm(perm) {}
+    inline const type& perm() const { return m_perm; }
+    // this is a reference to perm
+    const type& m_perm;
+};
+
 template<typename ExpressionType, int Side, bool Transposed>
 struct permutation_matrix_product<ExpressionType, Side, Transposed, SparseShape>
 {
-    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
-    typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+    using MatrixType = typename nested_eval<ExpressionType, 1>::type;
+    using MatrixTypeCleaned = remove_all_t<MatrixType>;
 
-    typedef typename MatrixTypeCleaned::Scalar Scalar;
-    typedef typename MatrixTypeCleaned::StorageIndex StorageIndex;
+    using Scalar = typename MatrixTypeCleaned::Scalar;
+    using StorageIndex = typename MatrixTypeCleaned::StorageIndex;
 
-    enum {
-      SrcStorageOrder = MatrixTypeCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
-      MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
-    };
-    
-    typedef std::conditional_t<MoveOuter,
-        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex>,
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> > ReturnType;
+    // the actual "return type" is `Dest`. this is a temporary type
+    using ReturnType = SparseMatrix<Scalar, MatrixTypeCleaned::IsRowMajor ? RowMajor : ColMajor, StorageIndex>;
+    using TmpHelper = XprHelper<ExpressionType, ReturnType>;
 
-    template<typename Dest,typename PermutationType>
-    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
-    {
-      MatrixType mat(xpr);
-      if(MoveOuter)
-      {
-        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex> tmp(mat.rows(), mat.cols());
-        Matrix<StorageIndex,Dynamic,1> sizes(mat.outerSize());
-        for(Index j=0; j<mat.outerSize(); ++j)
-        {
+    static constexpr bool NeedOuterPermutation = ExpressionType::IsRowMajor ? Side == OnTheLeft : Side == OnTheRight;
+    static constexpr bool NeedInversePermutation = Transposed ? Side == OnTheLeft : Side == OnTheRight;
+
+    template <typename Dest, typename PermutationType>
+    static inline void permute_outer(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+
+        // if ExpressionType is not ReturnType, evaluate `xpr` (allocation)
+        // otherwise, just reference `xpr`
+        // TODO: handle trivial expressions such as CwiseBinaryOp without temporary
+        const TmpHelper tmpHelper(xpr);
+        const ReturnType& tmp = tmpHelper.xpr();
+
+        ReturnType result(tmp.rows(), tmp.cols());
+
+        for (Index j = 0; j < tmp.outerSize(); j++) {
           Index jp = perm.indices().coeff(j);
-          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = StorageIndex(mat.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros());
+          Index jsrc = NeedInversePermutation ? jp : j;
+          Index jdst = NeedInversePermutation ? j : jp;
+          Index begin = tmp.outerIndexPtr()[jsrc];
+          Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[jsrc + 1] : begin + tmp.innerNonZeroPtr()[jsrc];
+          result.outerIndexPtr()[jdst + 1] += end - begin;
         }
-        tmp.reserve(sizes);
-        for(Index j=0; j<mat.outerSize(); ++j)
-        {
-          Index jp = perm.indices().coeff(j);
-          Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
-          Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
-          for(typename MatrixTypeCleaned::InnerIterator it(mat,jsrc); it; ++it)
-            tmp.insertByOuterInner(jdst,it.index()) = it.value();
-        }
-        dst = tmp;
-      }
-      else
-      {
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> tmp(mat.rows(), mat.cols());
-        Matrix<StorageIndex,Dynamic,1> sizes(tmp.outerSize());
-        sizes.setZero();
-        PermutationMatrix<Dynamic,Dynamic,StorageIndex> perm_cpy;
-        if((Side==OnTheLeft) ^ Transposed)
-          perm_cpy = perm;
-        else
-          perm_cpy = perm.transpose();
 
-        for(Index j=0; j<mat.outerSize(); ++j)
-          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
-            sizes[perm_cpy.indices().coeff(it.index())]++;
-        tmp.reserve(sizes);
-        for(Index j=0; j<mat.outerSize(); ++j)
-          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
-            tmp.insertByOuterInner(perm_cpy.indices().coeff(it.index()),j) = it.value();
-        dst = tmp;
-      }
+        std::partial_sum(result.outerIndexPtr(), result.outerIndexPtr() + result.outerSize() + 1,
+                         result.outerIndexPtr());
+        result.resizeNonZeros(result.nonZeros());
+
+        for (Index j = 0; j < tmp.outerSize(); j++) {
+          Index jp = perm.indices().coeff(j);
+          Index jsrc = NeedInversePermutation ? jp : j;
+          Index jdst = NeedInversePermutation ? j : jp;
+          Index begin = tmp.outerIndexPtr()[jsrc];
+          Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[jsrc + 1] : begin + tmp.innerNonZeroPtr()[jsrc];
+          Index target = result.outerIndexPtr()[jdst];
+          smart_copy(tmp.innerIndexPtr() + begin, tmp.innerIndexPtr() + end, result.innerIndexPtr() + target);
+          smart_copy(tmp.valuePtr() + begin, tmp.valuePtr() + end, result.valuePtr() + target);
+        }
+        dst = std::move(result);
     }
+
+    template <typename Dest, typename PermutationType>
+    static inline void permute_inner(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+        using InnerPermHelper = PermHelper<PermutationType, NeedInversePermutation>;
+        using InnerPermType = typename InnerPermHelper::type;
+
+        // if ExpressionType is not ReturnType, evaluate `xpr` (allocation)
+        // otherwise, just reference `xpr`
+        // TODO: handle trivial expressions such as CwiseBinaryOp without temporary
+        const TmpHelper tmpHelper(xpr);
+        const ReturnType& tmp = tmpHelper.xpr();
+
+        // if inverse permutation of inner indices is requested, calculate perm.inverse() (allocation)
+        // otherwise, just reference `perm`
+        const InnerPermHelper permHelper(perm);
+        const InnerPermType& innerPerm = permHelper.perm();
+
+        ReturnType result(tmp.rows(), tmp.cols());
+
+        for (Index j = 0; j < tmp.outerSize(); j++) {
+            Index begin = tmp.outerIndexPtr()[j];
+            Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[j + 1] : begin + tmp.innerNonZeroPtr()[j];
+            result.outerIndexPtr()[j + 1] += end - begin;
+        }
+
+        std::partial_sum(result.outerIndexPtr(), result.outerIndexPtr() + result.outerSize() + 1, result.outerIndexPtr());
+        result.resizeNonZeros(result.nonZeros());
+
+        for (Index j = 0; j < tmp.outerSize(); j++) {
+            Index begin = tmp.outerIndexPtr()[j];
+            Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[j + 1] : begin + tmp.innerNonZeroPtr()[j];
+            Index target = result.outerIndexPtr()[j];
+            std::transform(tmp.innerIndexPtr() + begin, tmp.innerIndexPtr() + end, result.innerIndexPtr() + target,
+                           [&innerPerm](StorageIndex i) { return innerPerm.indices().coeff(i); });
+            smart_copy(tmp.valuePtr() + begin, tmp.valuePtr() + end, result.valuePtr() + target);
+        }
+        // the inner indices were permuted, and must be sorted
+        result.sortInnerIndices();
+        dst = std::move(result);
+    }
+
+    template <typename Dest, typename PermutationType, bool DoOuter = NeedOuterPermutation, std::enable_if_t<DoOuter, int> = 0>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { permute_outer(dst, perm, xpr); }
+
+    template <typename Dest, typename PermutationType, bool DoOuter = NeedOuterPermutation, std::enable_if_t<!DoOuter, int> = 0>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { permute_inner(dst, perm, xpr); }
 };
 
 }
@@ -143,35 +216,34 @@
 } // end namespace internal
 
 /** \returns the matrix with the permutation applied to the columns
-  */
-template<typename SparseDerived, typename PermDerived>
-inline const Product<SparseDerived, PermDerived, AliasFreeProduct>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
-{ return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived()); }
+ */
+template <typename SparseDerived, typename PermDerived>
+inline const Product<SparseDerived, PermDerived, AliasFreeProduct> operator*(
+    const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm) {
+  return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived());
+}
 
 /** \returns the matrix with the permutation applied to the rows
-  */
-template<typename SparseDerived, typename PermDerived>
-inline const Product<PermDerived, SparseDerived, AliasFreeProduct>
-operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
-{ return  Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived()); }
-
+ */
+template <typename SparseDerived, typename PermDerived>
+inline const Product<PermDerived, SparseDerived, AliasFreeProduct> operator*(
+    const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix) {
+  return Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived());
+}
 
 /** \returns the matrix with the inverse permutation applied to the columns.
-  */
-template<typename SparseDerived, typename PermutationType>
-inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm)
-{
+ */
+template <typename SparseDerived, typename PermutationType>
+inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct> operator*(
+    const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm) {
   return Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>(matrix.derived(), tperm.derived());
 }
 
 /** \returns the matrix with the inverse permutation applied to the rows.
-  */
-template<typename SparseDerived, typename PermutationType>
-inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>
-operator*(const InverseImpl<PermutationType,PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix)
-{
+ */
+template <typename SparseDerived, typename PermutationType>
+inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct> operator*(
+    const InverseImpl<PermutationType, PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix) {
   return Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived());
 }
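The rewritten permutation_matrix_product above changes only the evaluation strategy (a counting pass plus std::partial_sum, then a bulk copy, with sortInnerIndices() for inner permutations); the operator* overloads it serves are untouched. A minimal usage sketch, with made-up sizes and values, of the expressions exercised here and in test/sparse_permutations.cpp:

#include <Eigen/SparseCore>
#include <iostream>

int main() {
  Eigen::SparseMatrix<double> A(3, 3);
  A.insert(0, 0) = 1.0;
  A.insert(1, 2) = 2.0;
  A.insert(2, 1) = 3.0;
  A.makeCompressed();

  Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> P(3);
  P.indices() << 2, 0, 1;

  Eigen::SparseMatrix<double> rowPerm = P * A;           // permute rows
  Eigen::SparseMatrix<double> colPerm = A * P;           // permute columns
  Eigen::SparseMatrix<double> invPerm = P.inverse() * A; // inverse permutation
  Eigen::SparseMatrix<double> twisted = A.twistedBy(P);  // P * A * P^-1

  std::cout << rowPerm.nonZeros() << " " << colPerm.nonZeros() << " "
            << invPerm.nonZeros() << " " << twisted.nonZeros() << "\n";
}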
 
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h
index b6ad6e8..0703c2b 100644
--- a/Eigen/src/misc/lapacke_helpers.h
+++ b/Eigen/src/misc/lapacke_helpers.h
@@ -76,12 +76,6 @@
   return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
 }
 
-/// translate UpLo type to the corresponding letter code
-template<UpLoType mode> char translate_mode;
-template<> constexpr char translate_mode<Lower> = 'L';
-template<> constexpr char translate_mode<Upper> = 'U';
-
-
 // ---------------------------------------------------------------------------------------------------------------------
 //              Automatic generation of low-level wrappers
 // ---------------------------------------------------------------------------------------------------------------------
diff --git a/bench/tensors/README b/bench/tensors/README
index dcbf021..acce5a1 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -16,5 +16,11 @@
 1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
 2. bash eigen_sycl_bench.sh
 
+To compile the floating point GPU benchmarks using the Intel DPCPP compiler:
+/path/to/dpcpp/bin/clang++  -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../  -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda"  -std=c++17  tensor_benchmarks_sycl.cc benchmark_main.cc  -lpthread -o eigen_dpcpp_sycl
+
 Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
 g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the contraction benchmark with DPCPP:
+/path/to/dpcpp/bin/clang++  -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../  -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17   tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 1ddaa12..639790c 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -230,10 +230,10 @@
 
   # Add the tests to ctest.
   add_test(NAME ${test_target_ok}
-          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $<CONFIGURATION>
+          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $<CONFIG>
           WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
   add_test(NAME ${test_target_ko}
-          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ko} --config $<CONFIGURATION>
+          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ko} --config $<CONFIG>
           WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
   # Expect the second test to fail
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
new file mode 100644
index 0000000..73aa30f
--- /dev/null
+++ b/cmake/FindDPCPP.cmake
@@ -0,0 +1,62 @@
+include_guard()
+
+include(CheckCXXCompilerFlag)
+include(FindPackageHandleStandardArgs)
+
+if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND
+   "${DPCPP_SYCL_ARCH}" STREQUAL "")
+   message(FATAL_ERROR "Architecture required for AMD DPCPP builds,"
+                       " please specify in DPCPP_SYCL_ARCH")
+endif()
+
+set(DPCPP_USER_FLAGS "" CACHE STRING 
+    "Additional user-specified compiler flags for DPC++")
+
+get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
+find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
+
+add_library(DPCPP::DPCPP INTERFACE IMPORTED)
+
+set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0")
+if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
+  if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa")
+    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
+    list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}")
+  elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
+    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
+    list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
+  endif()
+endif()
+
+if(UNIX)
+  set_target_properties(DPCPP::DPCPP PROPERTIES
+    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
+    INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
+    INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
+    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
+    message(STATUS ">>>>>>>>> DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl")
+else()
+  set_target_properties(DPCPP::DPCPP PROPERTIES
+    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
+    INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
+    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
+endif()
+
+function(add_sycl_to_target)
+  set(options)
+  set(one_value_args TARGET)
+  set(multi_value_args SOURCES)
+  cmake_parse_arguments(SB_ADD_SYCL
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+  target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
+  target_link_libraries(${SB_ADD_SYCL_TARGET} DPCPP::DPCPP)
+  target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17)
+  get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
+  if (NOT target_type STREQUAL "OBJECT_LIBRARY")
+    target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
+  endif()
+endfunction()
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index af8a1ef..f0a80df 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -83,6 +83,7 @@
       Scalar e = static_cast<Scalar>(ref(x(i,j), y(i,j)));
       Scalar a = actual(i, j);
       bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
+      if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
       all_pass &= success;
       if (!success) {
         std::cout << name << "(" << x(i,j) << "," << y(i,j) << ") = " << a << " !=  " << e << std::endl;
@@ -95,11 +96,11 @@
 template <typename Scalar>
 void binary_ops_test() {
   binary_op_test<Scalar>("pow",
-                         [](auto x, auto y) { return Eigen::pow(x, y); },
-                         [](auto x, auto y) { return std::pow(x, y); });
+                         [](const auto& x, const auto& y) { return Eigen::pow(x, y); },
+                         [](const auto& x, const auto& y) { return std::pow(x, y); });
   binary_op_test<Scalar>("atan2",
-                         [](auto x, auto y) { return Eigen::atan2(x, y); },
-                         [](auto x, auto y) { return std::atan2(x, y); });
+                         [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
+                         [](const auto& x, const auto& y) { return std::atan2(x, y); });
 }
 
 template <typename Scalar>
@@ -125,6 +126,7 @@
           Scalar a = eigenPow(j);
           bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
                          ((numext::isnan)(a) && (numext::isnan)(e));
+          if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
           all_pass &= success;
           if (!success) {
             std::cout << "pow(" << bases(j) << "," << exponent << ") = " << a << " !=  " << e << std::endl;
@@ -138,6 +140,7 @@
           Scalar a = eigenPow(j);
           bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
                          ((numext::isnan)(a) && (numext::isnan)(e));
+          if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
           all_pass &= success;
           if (!success) {
             std::cout << "pow(" << bases(j) << "," << exponent << ")   =   " << a << " !=  " << e << std::endl;
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index 06e04a2..34d7215 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -24,10 +24,10 @@
 
   ColVectorType cv1 = ColVectorType::Random(rows);
   RowVectorType rv1 = RowVectorType::Random(cols);
-  
+
   Scalar  s1 = internal::random<Scalar>(),
           s2 = internal::random<Scalar>();
-          
+
   // scalar addition
   VERIFY_IS_APPROX(m1.array() + s1, s1 + m1.array());
   VERIFY_IS_APPROX((m1.array() + s1).matrix(), MatrixType::Constant(rows,cols,s1) + m1);
@@ -55,18 +55,18 @@
   VERIFY_IS_APPROX(m3.rowwise() += rv1, m1.rowwise() + rv1);
   m3 = m1;
   VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
-  
-  // empty objects
-  VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().sum()), RowVectorType::Zero(cols));
-  VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().sum()), ColVectorType::Zero(rows));
-  VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().prod()), RowVectorType::Ones(cols));
-  VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().prod()), ColVectorType::Ones(rows));
 
-  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().sum(), RowVectorType::Zero(cols));
-  VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().sum(), ColVectorType::Zero(rows));
-  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().prod(), RowVectorType::Ones(cols));
-  VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().prod(), ColVectorType::Ones(rows));
-  
+  // empty objects
+  VERIFY_IS_EQUAL((m1.template block<0,Dynamic>(0,0,0,cols).colwise().sum()), RowVectorType::Zero(cols));
+  VERIFY_IS_EQUAL((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().sum()), ColVectorType::Zero(rows));
+  VERIFY_IS_EQUAL((m1.template block<0,Dynamic>(0,0,0,cols).colwise().prod()), RowVectorType::Ones(cols));
+  VERIFY_IS_EQUAL((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().prod()), ColVectorType::Ones(rows));
+
+  VERIFY_IS_EQUAL(m1.block(0,0,0,cols).colwise().sum(), RowVectorType::Zero(cols));
+  VERIFY_IS_EQUAL(m1.block(0,0,rows,0).rowwise().sum(), ColVectorType::Zero(rows));
+  VERIFY_IS_EQUAL(m1.block(0,0,0,cols).colwise().prod(), RowVectorType::Ones(cols));
+  VERIFY_IS_EQUAL(m1.block(0,0,rows,0).rowwise().prod(), ColVectorType::Ones(rows));
+
   // verify the const accessors exist
   const Scalar& ref_m1 = m.matrix().array().coeffRef(0);
   const Scalar& ref_m2 = m.matrix().array().coeffRef(0,0);
diff --git a/test/blasutil.cpp b/test/blasutil.cpp
index ee98df4..2fc8b68 100644
--- a/test/blasutil.cpp
+++ b/test/blasutil.cpp
@@ -13,7 +13,7 @@
 // for packet_traits<Packet*>
 // => The only workaround would be to wrap _m128 and the likes
 //    within wrappers.
-#if EIGEN_GNUC_AT_LEAST(6,0)
+#if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0)
     #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index 82f9d61..00838ea 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -139,7 +139,7 @@
     out[out_idx++] = a / numext::real(b);
     out[out_idx++] = numext::real(a) / b;
     
-#if !defined(EIGEN_COMP_MSVC)
+#if !EIGEN_COMP_MSVC
     out[out_idx] = a; out[out_idx++] += b;
     out[out_idx] = a; out[out_idx++] -= b;
     out[out_idx] = a; out[out_idx++] *= b;
@@ -191,7 +191,7 @@
     res.segment(block_idx, size) = x1.real().array() / x2.array();
     block_idx += size;
     
-#if !defined(EIGEN_COMP_MSVC)
+#if !EIGEN_COMP_MSVC
     res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
     block_idx += size;
     res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
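The #ifdef to #if changes here (and for EIGEN_COMP_MSVC and EIGEN_COMP_ICC elsewhere in this update) follow from the compiler-detection cleanup in d70b4864d: as far as I understand Eigen's Macros.h, the EIGEN_COMP_* macros are always defined, with value 0 when that compiler is not in use, so only a value test is meaningful. A minimal sketch of the difference, using a made-up macro name:

#include <cstdio>

#define EIGEN_COMP_EXAMPLE 0   // stand-in for e.g. EIGEN_COMP_NVCC on a non-NVCC build

int main() {
#ifdef EIGEN_COMP_EXAMPLE
  std::printf("#ifdef: taken even though the value is 0 (the old, misleading test)\n");
#endif
#if EIGEN_COMP_EXAMPLE
  std::printf("#if: taken only for a nonzero value (never printed here)\n");
#endif
}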
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 05080ad..659babe 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -146,7 +146,7 @@
   std::cout << "  EIGEN_CUDA_SDK_VER:             " << int(EIGEN_CUDA_SDK_VER) << "\n";
   #endif
 
-  #ifdef EIGEN_COMP_NVCC
+  #if EIGEN_COMP_NVCC
   std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << "\n";
   #endif
   
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
index 008e95c..ca9378b 100644
--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -395,7 +395,7 @@
   std::cout << "  EIGEN_CUDA_SDK_VER:          " << int(EIGEN_CUDA_SDK_VER) << std::endl;
   #endif
 
-  #ifdef EIGEN_COMP_NVCC
+  #if EIGEN_COMP_NVCC
   std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << std::endl;
   #endif
   
diff --git a/test/main.h b/test/main.h
index a52da9e..8538581 100644
--- a/test/main.h
+++ b/test/main.h
@@ -18,6 +18,9 @@
 #include <vector>
 #include <typeinfo>
 #include <functional>
+#ifdef EIGEN_USE_SYCL
+#include <CL/sycl.hpp>
+#endif
 
 // The following includes of STL headers have to be done _before_ the
 // definition of macros min() and max().  The reason is that many STL
@@ -121,9 +124,7 @@
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
 #define B0 FORBIDDEN_IDENTIFIER
-// `I` may be defined by complex.h:
 #define I  FORBIDDEN_IDENTIFIER
-
 // Unit tests calling Eigen's blas library must preserve the default blocking size
 // to avoid troubles.
 #ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
@@ -301,15 +302,16 @@
       }
     #endif //EIGEN_EXCEPTIONS
 
-  #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
+  #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS
     #define eigen_assert(a) \
       if( (!(a)) && (!no_more_assert) )       \
       {                                       \
         Eigen::no_more_assert = true;         \
-        if(report_on_cerr_on_assert_failure)  \
+        if(report_on_cerr_on_assert_failure) { \
           eigen_plain_assert(a);              \
-        else                                  \
+        } else {                                  \
           EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
+        } \
       }
 
     #ifdef EIGEN_EXCEPTIONS
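The braces added to the eigen_assert override in test/main.h guard against the classic dangling-else mis-pairing, which can bite when the macro used in the if branch (here eigen_plain_assert, now provided by the new ODR-safe Assert.h) itself expands to an if statement. A reduced sketch of that hazard, using a made-up PLAIN_ASSERT macro:

#include <cstdio>

// A made-up assert-style macro that expands to an unbraced if statement.
#define PLAIN_ASSERT(x) if (!(x)) std::printf("assertion failed\n")

int main() {
  bool report_on_cerr = true;   // caller asked for the plain-assert path
  bool condition_holds = true;  // the assertion passes
  // Without braces this expands to:
  //   if (report_on_cerr) if (!(condition_holds)) printf(...); else printf("throw ...");
  // so the else pairs with PLAIN_ASSERT's hidden if, and the "throw" branch
  // runs even though report_on_cerr is true and the condition holds.
  if (report_on_cerr)
    PLAIN_ASSERT(condition_holds);
  else
    std::printf("throw eigen_assert_exception\n");
}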
diff --git a/test/meta.cpp b/test/meta.cpp
index cac7af1..97ebdd7 100644
--- a/test/meta.cpp
+++ b/test/meta.cpp
@@ -86,9 +86,9 @@
     VERIFY(( check_is_convertible(A*B, A) ));
   }
 
-  #if (EIGEN_COMP_GNUC  && EIGEN_COMP_GNUC  <=  99) \
-   || (EIGEN_COMP_CLANG && EIGEN_COMP_CLANG <= 909) \
-   || (EIGEN_COMP_MSVC  && EIGEN_COMP_MSVC  <=1914)
+  #if (EIGEN_COMP_GNUC_STRICT  && EIGEN_COMP_GNUC  <=  990) \
+   || (EIGEN_COMP_CLANG_STRICT && EIGEN_COMP_CLANG <=  990) \
+   || (EIGEN_COMP_MSVC         && EIGEN_COMP_MSVC  <= 1914)
   // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1752,
   // basically, a fix in the c++ standard breaks our c++98 implementation
   // of is_convertible for abstract classes.
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index 4185f51..b0cc000 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -15,7 +15,6 @@
 
 template <typename MatrixType>
 void cod() {
-  STATIC_CHECK(( internal::is_same<typename CompleteOrthogonalDecomposition<MatrixType>::StorageIndex,int>::value ));
 
   Index rows = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
   Index cols = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
@@ -100,8 +99,6 @@
 {
   using std::sqrt;
 
-  STATIC_CHECK(( internal::is_same<typename ColPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
-
   Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
   Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
 
diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
index cca9a8c..bd40881 100644
--- a/test/qr_fullpivoting.cpp
+++ b/test/qr_fullpivoting.cpp
@@ -14,7 +14,6 @@
 
 template<typename MatrixType> void qr()
 {
-  STATIC_CHECK(( internal::is_same<typename FullPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
 
   static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime;
   Index max_size = EIGEN_TEST_MAX_SIZE;
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 6e2661e..a9fc9fd 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -105,7 +105,6 @@
         VERIFY(g_realloc_count==0);
       }
       
-      m2.finalize();
       VERIFY_IS_APPROX(m2,m1);
     }
 
diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
index 5974c74..775c560 100644
--- a/test/sparse_permutations.cpp
+++ b/test/sparse_permutations.cpp
@@ -17,6 +17,15 @@
     VERIFY( (#XPR) && nb_transposed_copies==N ); \
   }
 
+static long int nb_temporaries;
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN {nb_temporaries++;}
+#define VERIFY_TEMPORARY_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    XPR; \
+    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    VERIFY( (#XPR) && nb_temporaries==N ); \
+  }
+
 #include "sparse.h"
 
 template<typename T>
@@ -79,28 +88,52 @@
   VERIFY( is_sorted( ::eval(mat*p) ));
   VERIFY( is_sorted( res = mat*p ));
   VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p), 0);
-  //VERIFY_TRANSPOSITION_COUNT( res = mat*p, IsRowMajor ? 1 : 0 );
+  VERIFY_TEMPORARY_COUNT(     ::eval(mat*p), 1)
   res_d = mat_d*p;
   VERIFY(res.isApprox(res_d) && "mat*p");
 
   VERIFY( is_sorted( ::eval(p*mat) ));
   VERIFY( is_sorted( res = p*mat ));
   VERIFY_TRANSPOSITION_COUNT( ::eval(p*mat), 0);
+  VERIFY_TEMPORARY_COUNT(     ::eval(p*mat), 1);
   res_d = p*mat_d;
   VERIFY(res.isApprox(res_d) && "p*mat");
 
   VERIFY( is_sorted( (mat*p).eval() ));
   VERIFY( is_sorted( res = mat*p.inverse() ));
   VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p.inverse()), 0);
+  VERIFY_TEMPORARY_COUNT(     ::eval(mat*p.inverse()), 1);
   res_d = mat*p.inverse();
   VERIFY(res.isApprox(res_d) && "mat*inv(p)");
 
   VERIFY( is_sorted( (p*mat+p*mat).eval() ));
   VERIFY( is_sorted( res = p.inverse()*mat ));
   VERIFY_TRANSPOSITION_COUNT( ::eval(p.inverse()*mat), 0);
+  VERIFY_TEMPORARY_COUNT(     ::eval(p.inverse()*mat), 1);
   res_d = p.inverse()*mat_d;
   VERIFY(res.isApprox(res_d) && "inv(p)*mat");
 
+  // test non-plaintype expressions that require additional temporary
+  const Scalar alpha(2.34);
+
+  res_d = p * (alpha * mat_d);
+  VERIFY_TEMPORARY_COUNT( res = p * (alpha * mat), 2);
+  VERIFY( res.isApprox(res_d) && "p*(alpha*mat)" );
+
+  res_d = (alpha * mat_d) * p;
+  VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p, 2);
+  VERIFY( res.isApprox(res_d) && "(alpha*mat)*p" );
+
+  res_d = p.inverse() * (alpha * mat_d);
+  VERIFY_TEMPORARY_COUNT( res = p.inverse() * (alpha * mat), 2);
+  VERIFY( res.isApprox(res_d) && "inv(p)*(alpha*mat)" );
+
+  res_d = (alpha * mat_d) * p.inverse();
+  VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p.inverse(), 2);
+  VERIFY( res.isApprox(res_d) && "(alpha*mat)*inv(p)" );
+
+  //
+
   VERIFY( is_sorted( (p * mat * p.inverse()).eval() ));
   VERIFY( is_sorted( res = mat.twistedBy(p) ));
   VERIFY_TRANSPOSITION_COUNT( ::eval(p * mat * p.inverse()), 0);
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index 8b46fc8..dbf549a 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -477,7 +477,7 @@
     initSparse<Real>(0.1, refA, A);
     initSparse<Real>(0.1, refB, B);
 
-    SparseMatrix<Real, OrderC, std::int16_t> result;
+    SparseMatrix<Real, OrderC, std::int8_t> result;
     SparseMatrix<Real, OrderC> result_large;
     DenseMat refResult;
 
@@ -486,8 +486,8 @@
 
   // Case: Small input but large result
   {
-    SparseMatrix<Real, OrderA, std::int16_t> A(127, 8);
-    SparseMatrix<Real, OrderB, std::int16_t> B(8, 127);
+    SparseMatrix<Real, OrderA, std::int8_t> A(127, 8);
+    SparseMatrix<Real, OrderB, std::int8_t> B(8, 127);
     DenseMat refA(127, 8);
     DenseMat refB(8, 127);
 
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index a6441ab..59f88cf 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -26,7 +26,7 @@
 // for packet_traits<Packet*>
 // => The only workaround would be to wrap _m128 and the likes
 //    within wrappers.
-#if EIGEN_GNUC_AT_LEAST(6,0)
+#if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0)
     #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 0bd3a00..3365b72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -170,10 +170,10 @@
     m_rightImpl.cleanup();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
 
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index a4ac2ad..e3fc8f4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -1185,7 +1185,6 @@
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return derived();
     }
-
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
     EIGEN_DEVICE_FUNC
@@ -1195,3 +1194,4 @@
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index ef01e30..e5f3d5e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -438,13 +438,13 @@
     : Base(op, device)
     { }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     return this->m_impl.coeffRef(this->srcCoeff(index));
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     if (this->isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 073be81..dea41df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -331,7 +331,7 @@
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     // Collect dimension-wise indices (subs).
     array<Index, Base::NumDims> subs;
@@ -360,7 +360,7 @@
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index c629e44..3a917a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -362,10 +362,8 @@
     const OutputKernelType m_output_kernel;
 };
 
-
 template<typename Derived>
-struct TensorContractionEvaluatorBase : internal::no_assignment_operator
-{
+struct TensorContractionEvaluatorBase {
   typedef typename internal::traits<Derived>::Indices Indices;
   typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
   typedef typename internal::traits<Derived>::RightArgType RightArgType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index c6c4077..526fc81 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -597,7 +597,7 @@
                                                                 const TripleDim triple_dim_)
       : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
     const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
     const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
@@ -636,7 +636,7 @@
   // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
   // it only compute the block on each thread's private memory space
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
-                                                                    PacketReturnType *privateRes) {
+                                                                    PacketReturnType *privateRes) const {
     StorageIndex idx = 0;
     EIGEN_CONSTEXPR StorageIndex lhs_stride =
         contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
@@ -661,7 +661,7 @@
   // class.
   template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
-                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
     auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
       return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
     };
@@ -713,7 +713,7 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
       std::enable_if_t<contract_tp == contraction_type::no_local>
       extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
-                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) const {
     EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
         InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
     EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
@@ -833,7 +833,8 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
                                                                     ThreadProperties<StorageIndex> &thread_properties,
                                                                     TiledMemory &tiled_input_block,
-                                                                    PacketReturnType *privateRes, bool &db_offset) {
+                                                                    PacketReturnType *privateRes, bool &db_offset) const {
+
     // Tiling the Rhs block from global to local memory
     extract_block<RHSBlockProperties, is_internal_block>(
         rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
@@ -871,7 +872,7 @@
   template <bool is_internal_block, typename OutPtr>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
                                                            ThreadProperties<StorageIndex> &thread_properties,
-                                                           OutPtr out_ptr) {
+                                                           OutPtr out_ptr) const {
     auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
     // Allocate register space
     PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
@@ -897,7 +898,7 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
       std::enable_if_t<contract_tp == contraction_type::local>
       extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
-                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) const {
     EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
         InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
     EIGEN_CONSTEXPR StorageIndex LoadPerThread =
@@ -1035,7 +1036,7 @@
         nonContractDim(nonContractDim_),
         contractDim(contractDim_) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     auto scratch_ptr = scratch.get_pointer();
     const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
     StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
@@ -1252,7 +1253,8 @@
                            const StorageIndex rng_)
       : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
 
-  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
+
     auto out_ptr = out_res.get_pointer();
     auto scratch_ptr = scratch.get_pointer().get();
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 70d2129..e6e586b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -123,7 +123,9 @@
         inputIndex += idx * m_inputStrides[d];
         p -= idx * m_gpuInputStrides[d];
       }
-      inputIndex += p * m_inputStrides[NumKernelDims];
+      if (NumKernelDims < NumDims) {
+        inputIndex += p * m_inputStrides[NumKernelDims];
+      }
     } else {
       std::ptrdiff_t limit = 0;
       if (NumKernelDims < NumDims) {
@@ -147,7 +149,9 @@
         outputIndex += idx * m_outputStrides[d];
         p -= idx * m_gpuOutputStrides[d];
       }
-      outputIndex += p * m_outputStrides[NumKernelDims];
+      if (NumKernelDims < NumDims) {
+        outputIndex += p * m_outputStrides[NumKernelDims];
+      }
     } else {
       std::ptrdiff_t limit = 0;
       if (NumKernelDims < NumDims) {
@@ -386,7 +390,7 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     preloadKernel();
     return true;
@@ -824,7 +828,7 @@
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
     preloadKernel();
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
@@ -1112,9 +1116,6 @@
   }
 
  private:
-  // No assignment (copies are needed by the kernels)
-  TensorEvaluator& operator = (const TensorEvaluator&);
-
   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
   KernelArgType m_kernelArg;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 3cbc1ab..7ccb174 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -57,10 +57,10 @@
         input_range(input_range_) {}
 
   template <typename BooleanDim2>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
     return (boolean_check[0] && boolean_check[1]);
   }
-  void operator()(cl::sycl::nd_item<2> itemID) {
+  void operator()(cl::sycl::nd_item<2> itemID) const {
     auto buffer_ptr = buffer_acc.get_pointer();
     auto kernel_ptr = kernel_filter.get_pointer();
     // the required row to be calculated for the for each plane in shered memory
@@ -123,11 +123,11 @@
         kernel_size(kernel_size_),
         input_range(input_range_) {}
   template <typename BooleanDim3>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
     return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
   }
 
-  void operator()(cl::sycl::nd_item<3> itemID) {
+  void operator()(cl::sycl::nd_item<3> itemID) const {
     auto buffer_ptr = buffer_acc.get_pointer();
     auto kernel_ptr = kernel_filter.get_pointer();
     // the required row to be calculated for the for each plane in shered memory
@@ -212,10 +212,10 @@
         input_range(input_range_),
         numP(numP_) {}
   template <typename BooleanDim3>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
     return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
   }
-  void operator()(cl::sycl::nd_item<3> itemID) {
+  void operator()(cl::sycl::nd_item<3> itemID) const {
     auto buffer_ptr = buffer_acc.get_pointer();
     auto kernel_ptr = kernel_filter.get_pointer();
     const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 84ebe38..8fdc8ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -31,8 +31,7 @@
                 .template get_info<cl::sycl::info::device::local_mem_type>()),
         max_work_item_sizes(
             queue.get_device()
-                .template get_info<
-                    cl::sycl::info::device::max_work_item_sizes>()),
+                .template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
         max_mem_alloc_size(
             queue.get_device()
                 .template get_info<
@@ -356,7 +355,7 @@
       return;
     }
     const ptrdiff_t count = end - begin;
-    auto f = [&](cl::sycl::handler &cgh) {
+    auto f = [&](cl::sycl::handler &cgh) {
       auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
       cgh.fill(dst_acc, value);
     };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 0e9cdfe..7e13c69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -159,10 +159,10 @@
   }
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
     m_buffer[i] = m_impl.coeff(i);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 92ad0f5..2bd94c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -98,7 +98,7 @@
     return m_data[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
     eigen_assert(m_data != NULL);
     return m_data[index];
   }
@@ -122,7 +122,7 @@
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
   }
@@ -137,7 +137,7 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
-  coeffRef(const array<DenseIndex, NumCoords>& coords) {
+  coeffRef(const array<DenseIndex, NumCoords>& coords) const {
     eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
@@ -978,7 +978,14 @@
   TensorEvaluator<ElseArgType, Device> m_elseImpl;
 };
 
-
 } // end namespace Eigen
 
+#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
+template <typename Derived, typename Device>
+struct cl::sycl::is_device_copyable<
+    Eigen::TensorEvaluator<Derived, Device>,
+    std::enable_if_t<!std::is_trivially_copyable<
+        Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
+#endif
+
 #endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index de9bed4..f961b40 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -688,12 +688,12 @@
       : evaluator(evaluator_), range(range_) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
-      cl::sycl::nd_item<1> itemID) {
+      cl::sycl::nd_item<1> itemID) const {
     compute(itemID);
   }
   template <bool is_vec = Evaluator::PacketAccess>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
-  compute(const cl::sycl::nd_item<1>& itemID) {
+  compute(const cl::sycl::nd_item<1>& itemID) const {
     Index gId = static_cast<Index>(itemID.get_global_linear_id());
     Index total_threads = itemID.get_global_range(0);
 
@@ -703,7 +703,7 @@
   }
   template <bool is_vec = Evaluator::PacketAccess>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
-  compute(const cl::sycl::nd_item<1>& itemID) {
+  compute(const cl::sycl::nd_item<1>& itemID) const {
     const Index vectorizedRange =
         (range / Evaluator::PacketSize) * Evaluator::PacketSize;
     Index gId = static_cast<Index>(itemID.get_global_linear_id());
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index d12bd6d..8b14085 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -202,12 +202,12 @@
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     return this->m_impl.coeffRef(index);
   }
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 0205710..b696190b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -267,13 +267,13 @@
       TensorBlockDesc;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     return this->m_impl.coeffRef(index);
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
@@ -733,7 +733,7 @@
     : Base(op, device)
     { }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     if (this->m_is_identity) {
       return this->m_impl.coeffRef(index);
@@ -743,7 +743,7 @@
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     if (this->m_is_identity) {
       this->m_impl.template writePacket<StoreMode>(index, x);
@@ -1085,7 +1085,7 @@
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Strides Dimensions;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     if (this->m_is_identity) {
       return this->m_impl.coeffRef(index);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index e62397a..ae03ba5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -895,7 +895,7 @@
   // binding placeholder accessors to a command group handler for SYCL
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
     m_impl.bind(cgh);
-    m_result.bind(cgh);
+    if(m_result) m_result.bind(cgh);
   }
 #endif
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 397870f..715797d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -87,7 +87,7 @@
   SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
       : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
 
-  void operator()(cl::sycl::nd_item<1> itemID) {
+  void operator()(cl::sycl::nd_item<1> itemID) const {
     // Our empirical research shows that the best performance will be achieved
     // when there is only one element per thread to reduce in the second step.
     // in this step the second step reduction time is almost negligible.
@@ -141,11 +141,11 @@
                              Index rng_, OpType op_)
       : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
 
-  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
 
   template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
-      const cl::sycl::nd_item<1> &itemID) {
+      const cl::sycl::nd_item<1> &itemID) const {
     auto output_ptr = final_output.get_pointer();
     Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
     Index globalid = itemID.get_global_id(0);
@@ -184,7 +184,7 @@
 
   template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
-      const cl::sycl::nd_item<1> &itemID) {
+      const cl::sycl::nd_item<1> &itemID) const {
     auto output_ptr = final_output.get_pointer();
     Index globalid = itemID.get_global_id(0);
     Index localid = itemID.get_local_id(0);
@@ -228,14 +228,16 @@
         range(range_),
         num_values_to_reduce(num_values_to_reduce_) {}
 
-  void operator()(cl::sycl::nd_item<1> itemID) {
+  void operator()(cl::sycl::nd_item<1> itemID) const {
+    // This is to bypass the stateful behavior of the Eigen MeanReducer.
+    Op non_const_functor;
+    std::memcpy(&non_const_functor, &functor, sizeof (Op));
     auto output_accessor_ptr = output_accessor.get_pointer();
-    /// const cast added as a naive solution to solve the qualifier drop error
     Index globalid = static_cast<Index>(itemID.get_global_linear_id());
     if (globalid < range) {
       CoeffReturnType accum = functor.initialize();
       Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
-          evaluator, evaluator.firstInput(globalid), functor, &accum);
+          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
       output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
     }
   }
@@ -281,7 +283,7 @@
         num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
-                                                                 CoeffReturnType &accumulator) {
+                                                                 CoeffReturnType &accumulator) const {
     if (globalPId >= num_coeffs_to_preserve) {
       return;
     }
@@ -298,7 +300,7 @@
       global_offset += per_thread_global_stride;
     }
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     const Index linearLocalThreadId = itemID.get_local_id(0);
     Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                            : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
@@ -380,7 +382,7 @@
         num_coeffs_to_preserve(num_coeffs_to_preserve_),
         num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     const Index globalId = itemID.get_global_id(0);
 
     if (globalId >= num_coeffs_to_preserve) return;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index b5e66b3..342d9e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -441,12 +441,12 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
     return this->m_impl.coeffRef(this->reverseIndex(index));
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x) {
+  void writePacket(Index index, const PacketReturnType& x) const {
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // This code is pilfered from TensorMorphing.h
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
index 636fb7d..ecc872b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -109,28 +109,28 @@
   template <scan_step sst = stp, typename Input>
   std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
       EIGEN_STRONG_INLINE
-      read(const Input &inpt, Index global_id) {
+      read(const Input &inpt, Index global_id) const {
     return inpt.coeff(global_id);
   }
 
   template <scan_step sst = stp, typename Input>
   std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
       EIGEN_STRONG_INLINE
-      read(const Input &inpt, Index global_id) {
+      read(const Input &inpt, Index global_id) const {
     return inpt[global_id];
   }
 
   template <scan_step sst = stp, typename InclusiveOp>
   std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+  first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
     inclusive_op();
   }
 
   template <scan_step sst = stp, typename InclusiveOp>
   std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  first_step_inclusive_Operation(InclusiveOp) {}
+  first_step_inclusive_Operation(InclusiveOp) const {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     auto out_ptr = out_accessor.get_pointer();
     auto tmp_ptr = temp_accessor.get_pointer();
     auto scratch_ptr = scratch.get_pointer().get();
@@ -307,7 +307,7 @@
         scanParameters(scanParameters_),
         accumulator(accumulator_) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     auto in_ptr = in_accessor.get_pointer();
     auto out_ptr = out_accessor.get_pointer();
 
@@ -473,7 +473,7 @@
   typedef typename Self::CoeffReturnType CoeffReturnType;
   typedef typename Self::Storage Storage;
   typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
-  void operator()(Self &self, EvaluatorPointerType data) {
+  void operator()(Self &self, EvaluatorPointerType data) const {
     const Index total_size = internal::array_prod(self.dimensions());
     const Index scan_size = self.size();
     const Index scan_stride = self.stride();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 977263a..3d97e66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -390,13 +390,13 @@
       : Base(op, device)
   { }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
   {
     return this->m_impl.coeffRef(this->srcCoeff(index));
   }
 
   template <int StoreMode> EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 609afe3..681c4e5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -288,13 +288,13 @@
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const
   {
     return this->m_impl.coeffRef(this->srcCoeff(index));
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
+  void writePacket(Index index, const PacketReturnType& x) const
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
index 4545119..ff3f834 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -31,8 +31,7 @@
 #endif
 // Checks whether C++11's `thread_local` storage duration specifier is
 // supported.
-#if defined(__apple_build_version__) &&     \
-    ((__apple_build_version__ < 8000042) || \
+#if EIGEN_COMP_CLANGAPPLE && ((EIGEN_COMP_CLANGAPPLE < 8000042) || \
      (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
 // Notes: Xcode's clang did not support `thread_local` until version
 // 8, and even then not for all iOS < 9.0.
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index cf5828e..5c26133 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -189,6 +189,7 @@
         readsizes = true;
         mat.resize(M,N);
         mat.reserve(NNZ);
+        elements.reserve(NNZ);
       }
     }
     else
diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt
index 8ee96f5..4fe94c6 100644
--- a/unsupported/doc/examples/SYCL/CMakeLists.txt
+++ b/unsupported/doc/examples/SYCL/CMakeLists.txt
@@ -2,9 +2,8 @@
 
 set(EIGEN_SYCL ON)
 list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
-if(EIGEN_SYCL_TRISYCL)
-  set(CMAKE_CXX_STANDARD 17)
-else(EIGEN_SYCL_TRISYCL)
+set(CMAKE_CXX_STANDARD 17)
+if(EIGEN_SYCL_ComputeCpp)
   if(MSVC)
     list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
   else()
@@ -22,7 +21,7 @@
       -no-serial-memop
       -Xclang
       -cl-mad-enable)
-endif(EIGEN_SYCL_TRISYCL)
+endif(EIGEN_SYCL_ComputeCpp)
 
 FOREACH(example_src ${examples_SRCS})
   GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 21d8c5e..45cc4a3 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -122,6 +122,7 @@
 
 if(EIGEN_TEST_SYCL)
   set(EIGEN_SYCL ON)
+  set(CMAKE_CXX_STANDARD 17)
   # Forward CMake options as preprocessor definitions
   if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
     add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
@@ -172,10 +173,7 @@
     add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
   endif()
 
-  if(EIGEN_SYCL_TRISYCL)
-    # triSYCL now requires c++17.
-    set(CMAKE_CXX_STANDARD 17)
-  else()
+  if(EIGEN_SYCL_ComputeCpp)
     if(MSVC)
       list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
     else()
@@ -193,7 +191,7 @@
         -no-serial-memop
         -Xclang
         -cl-mad-enable)
-  endif()
+  endif(EIGEN_SYCL_ComputeCpp)
 
   ei_add_test(cxx11_tensor_sycl)
   ei_add_test(cxx11_tensor_image_op_sycl)
@@ -409,4 +407,3 @@
   endif()
 
 endif()
-
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 7bf4e96..714540e 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -15,7 +15,7 @@
 // Visual studio doesn't implement a rand_r() function since its
 // implementation of rand() is already thread safe
 int rand_reentrant(unsigned int* s) {
-#ifdef EIGEN_COMP_MSVC_STRICT
+#if EIGEN_COMP_MSVC_STRICT
   EIGEN_UNUSED_VARIABLE(s);
   return rand();
 #else
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 8fc5a30..0f7f13b 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -17,7 +17,7 @@
 // Visual studio doesn't implement a rand_r() function since its
 // implementation of rand() is already thread safe
 int rand_reentrant(unsigned int* s) {
-#ifdef EIGEN_COMP_MSVC_STRICT
+#if EIGEN_COMP_MSVC_STRICT
   EIGEN_UNUSED_VARIABLE(s);
   return rand();
 #else
diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 79f4066..d9d5da4 100644
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -23,8 +23,8 @@
 void test_gpu_simple_argmax()
 {
   Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
-  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
-  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+  Tensor<DenseIndex, 0, Layout> out_max;
+  Tensor<DenseIndex, 0, Layout> out_min;
   in.setRandom();
   in *= in.constant(100.0);
   in(0, 0, 0) = -1000.0;
@@ -46,8 +46,8 @@
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout>, Aligned > gpu_out_max(d_out_max);
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout>, Aligned > gpu_out_min(d_out_min);
 
   gpu_out_max.device(gpu_device) = gpu_in.argmax();
   gpu_out_min.device(gpu_device) = gpu_in.argmin();
@@ -56,8 +56,8 @@
   assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
-  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
-  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+  VERIFY_IS_EQUAL(out_max(), 72*53*97 - 1);
+  VERIFY_IS_EQUAL(out_min(), 0);
 
   gpuFree(d_in);
   gpuFree(d_out_max);
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index 27a8254..be7afd2 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -27,17 +27,64 @@
 
 // Functions used to compare the TensorMap implementation on the device with
 // the equivalent on the host
-namespace cl {
-namespace sycl {
-template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
+namespace SYCL {
+template <typename T> T abs(T x) { 
+  return cl::sycl::abs(x);
+}
+
+template <> float abs(float x) { 
+  return cl::sycl::fabs(x);
+}
+
+template <> double abs(double x) { 
+  return cl::sycl::fabs(x);
+}
+
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
 template <typename T> T inverse(T x) { return T(1) / x; }
-template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
-template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
+template <typename T> T cwiseMax(T x, T y) {
+  return cl::sycl::max(x, y);
+}
+template <typename T> T cwiseMin(T x, T y) {
+  return cl::sycl::min(x, y);
 }
 }
 
+
+#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC)                        \
+  struct op_##FUNC {                                               \
+    template <typename T>                                          \
+    auto operator()(const T& x) {                                  \
+      return SYCL::FUNC(x);                                        \
+    }                                                              \
+    template <typename T>                                          \
+    auto operator()(const TensorMap<T>& x) {                       \
+      return x.FUNC();                                             \
+    }                                                              \
+  };
+
+DECLARE_UNARY_STRUCT_NON_SYCL(abs)
+DECLARE_UNARY_STRUCT_NON_SYCL(square)
+DECLARE_UNARY_STRUCT_NON_SYCL(cube)
+DECLARE_UNARY_STRUCT_NON_SYCL(inverse)
+
+#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC)                                                 \
+  struct op_##FUNC {                                                                         \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const T1& x, const T2& y) {                                              \
+      return SYCL::FUNC(x, y);                                                               \
+    }                                                                                        \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) {                        \
+      return x.FUNC(y);                                                                      \
+    }                                                                                        \
+  };
+
+DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax)
+DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin)
+
+
 struct EqualAssignment {
   template <typename Lhs, typename Rhs>
   void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
@@ -119,12 +166,9 @@
     }                                                              \
   };
 
-DECLARE_UNARY_STRUCT(abs)
+
 DECLARE_UNARY_STRUCT(sqrt)
 DECLARE_UNARY_STRUCT(rsqrt)
-DECLARE_UNARY_STRUCT(square)
-DECLARE_UNARY_STRUCT(cube)
-DECLARE_UNARY_STRUCT(inverse)
 DECLARE_UNARY_STRUCT(tanh)
 DECLARE_UNARY_STRUCT(exp)
 DECLARE_UNARY_STRUCT(expm1)
@@ -288,8 +332,6 @@
     }                                                                                        \
   };
 
-DECLARE_BINARY_STRUCT(cwiseMax)
-DECLARE_BINARY_STRUCT(cwiseMin)
 
 #define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR)                          \
   struct op_##NAME {                                                      \
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index d7ff38d..74e9026 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -23,6 +23,13 @@
 #include <stdint.h>
 #include <iostream>
 
+#ifdef SYCL_COMPILER_IS_DPCPP
+template <typename T>
+struct cl::sycl::is_device_copyable<
+    const OffByOneScalar<T>,
+    std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
+#endif
+
 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
   IndexType sizeDim1 = 100;
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 7b3fb5a..31baf1b 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -1100,9 +1100,9 @@
 template <typename Scalar>
 void test_gpu_ndtri()
 {
-  Tensor<Scalar, 1> in_x(8);
-  Tensor<Scalar, 1> out(8);
-  Tensor<Scalar, 1> expected_out(8);
+  Tensor<Scalar, 1> in_x(9);
+  Tensor<Scalar, 1> out(9);
+  Tensor<Scalar, 1> expected_out(9);
   out.setZero();
 
   in_x(0) = Scalar(1);