Update Eigen to commit: 2e9b945baf2901b644decf3fe48c84679d303d14

CHANGELOG
=========
2e9b945ba - Fix bug that disabled vectorization for coeffMin/coeffMax.
bc5cdc7a6 - Guard use of long double on GPU device.
e4598fedb - Fix compiler versions for certain instructions on Power.
1c0a6cf22 - Get rid of EIGEN_HAS_AVX512_MATH workaround.
00844e386 - Fix a number of MSAN failures in SVD tests.
c3f67063e - [SYCL-2020] Fix null placeholder accessor issue in Reduction SYCL test
6bcd941ee - Use pmsub in twoprod. This speeds up pow() on Skylake by ~1%.
ce62177b5 - Vectorize atanh & add a missing definition and unit test for atan.
049a14479 - Add typed logicals
e79797468 - Add and enable Packet int divide for Power10.
54459214a - Fix epsilon and dummy_precision values in long double for double doubles.  Prevented some algorithms from converging on PPC.
a16fb889d - Guard complex sqrt on old MSVC compilers.
94b19dc5f - Add CArg
71a8e60a7 - Tweak pasin_float, fix psqrt_complex
384269937 - More NEON packetmath fixes.
c15b38620 - Fix MSVC atan2 test.
2dfbf1b25 - Fix NEON make_packet2f.
07aaa62e6 - Fix compiler warnings in tests.
4a0340956 - Fix problem with array conversions BF16->F32 in Power.
77b48c440 - Fix compiler warnings.
0ecae6156 - Disable array BF16 to F32 conversions in Power
c999284ba - Print diagonal matrix
fba12e02b - Fold extra column calculations into an extra MMA accumulator and other bfloat16 MMA GEMM improvements
79cfc74f4 - Revert ODR changes and make gemm_extra_cols and gemm_complex_extra_cols EIGEN_ALWAYS_INLINE to avoid external functions.
f9659d91f - Fix ODR violation with `gemm_extra_cols` on PPC
325e3063d - Optimize psign
0e490d452 - Update file ColPivHouseholderQR_LAPACKE.h
0a5392d60 - Fix MSVC arm build.
3f7e77571 - Add IWYU export pragmas to top-level headers.
e4f58816d - Get rid of custom implementation of equal_to and not_equal_to. No longer needed with c++14.
e256ad182 - Remove LGPL Code and references.
e71f88abc - Change in Power eigen_asserts to eigen_internal_asserts since it is putting unnecessary error checking and assertions without NDEBUG.
232b18fa8 - Fixes #2602
f6cc359e1 - More EIGEN_DEVICE_FUNC fixes for CUDA 10/11/12.
2a9065339 - fix lapacke config

PiperOrigin-RevId: 514870586
Change-Id: I069caeedac31a1adaba0bbdfb44329884bd7f75f
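
Below is a minimal usage sketch (not part of the patch itself) illustrating two of the
user-visible additions listed above: streaming a diagonal matrix directly (c999284ba,
which adds operator<< for DiagonalBase together with a dense-style coeff()), and the
coefficient-wise complex argument added with CArg (94b19dc5f). The free function
Eigen::carg() is the one generated by the EIGEN_ARRAY_DECLARE_GLOBAL_UNARY line in the
diff; output formatting shown in the comments is indicative only.

  #include <complex>
  #include <iostream>
  #include <Eigen/Core>

  int main() {
    // Diagonal matrices can now be printed without materializing a dense copy.
    Eigen::DiagonalMatrix<double, 3> d(1.0, 2.0, 3.0);
    std::cout << d << "\n";             // prints the 3x3 matrix, zeros off the diagonal
    std::cout << d.coeff(0, 1) << "\n"; // new coeff(): returns 0 for off-diagonal entries

    // Coefficient-wise complex argument, returned as a complex scalar.
    Eigen::ArrayXcf a(2);
    a << std::complex<float>(1.f, 1.f), std::complex<float>(-1.f, 0.f);
    std::cout << Eigen::carg(a) << "\n";
    return 0;
  }
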
diff --git a/Eigen/AccelerateSupport b/Eigen/AccelerateSupport
index 8cee7ac..e16f36b 100644
--- a/Eigen/AccelerateSupport
+++ b/Eigen/AccelerateSupport
@@ -43,7 +43,9 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/AccelerateSupport/AccelerateSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index 2c686f1..4211151 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -29,12 +29,14 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
 #include "src/misc/lapacke_helpers.h"
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index 1037bd5..2ccb211 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -40,7 +40,9 @@
   *
   */
 
+// IWYU pragma: begin_exports
 #include "src/CholmodSupport/CholmodSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Core b/Eigen/Core
index af71891..c4bde63 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -155,6 +155,15 @@
   * \endcode
   */
 
+#ifdef EIGEN_USE_LAPACKE
+  #ifdef EIGEN_USE_MKL
+    #include "mkl_lapacke.h"
+  #else
+    #include "src/misc/lapacke.h"
+  #endif
+#endif
+
+// IWYU pragma: begin_exports
 #include "src/Core/util/Constants.h"
 #include "src/Core/util/Meta.h"
 #include "src/Core/util/Assert.h"
@@ -390,6 +399,7 @@
 #endif
 
 #include "src/Core/GlobalFunctions.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 5467a2e..97c526d 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -33,6 +33,8 @@
   */
 
 #include "src/misc/RealSvd2x2.h"
+
+// IWYU pragma: begin_exports
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@@ -54,6 +56,7 @@
 #include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
 #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Geometry b/Eigen/Geometry
index bc78110..0a733c5 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -33,9 +33,9 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"
-
 #include "src/Geometry/Homogeneous.h"
 #include "src/Geometry/RotationBase.h"
 #include "src/Geometry/Rotation2D.h"
@@ -53,6 +53,7 @@
 #if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
 #include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Householder b/Eigen/Householder
index f2fa799..0f7f9b3 100644
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -20,9 +20,11 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/Householder/Householder.h"
 #include "src/Householder/HouseholderSequence.h"
 #include "src/Householder/BlockHouseholder.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers
index 26a0560..833189b 100644
--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -34,6 +34,7 @@
     \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/IterativeLinearSolvers/SolveWithGuess.h"
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
@@ -42,6 +43,7 @@
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
 #include "src/IterativeLinearSolvers/IncompleteCholesky.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Jacobi b/Eigen/Jacobi
index 43edc7a..490a6a8 100644
--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi
@@ -24,7 +24,9 @@
   *  - MatrixBase::applyOnTheRight().
   */
 
+// IWYU pragma: begin_exports
 #include "src/Jacobi/Jacobi.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/KLUSupport b/Eigen/KLUSupport
index b23d905..5c8f8a3 100644
--- a/Eigen/KLUSupport
+++ b/Eigen/KLUSupport
@@ -8,9 +8,9 @@
 #ifndef EIGEN_KLUSUPPORT_MODULE_H
 #define EIGEN_KLUSUPPORT_MODULE_H
 
-#include <Eigen/SparseCore>
+#include "SparseCore"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "src/Core/util/DisableStupidWarnings.h"
 
 extern "C" {
 #include <btf.h>
@@ -34,8 +34,10 @@
   *
   */
 
+// IWYU pragma: begin_exports
 #include "src/KLUSupport/KLUSupport.h"
+// IWYU pragma: end_exports
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_KLUSUPPORT_MODULE_H
diff --git a/Eigen/LU b/Eigen/LU
index b7f9a8a..49d7b91 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -25,6 +25,8 @@
 
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
+
+// IWYU pragma: begin_exports
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
@@ -37,6 +39,7 @@
 #if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
   #include "src/LU/arch/InverseSize4.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/MetisSupport b/Eigen/MetisSupport
index 85c41bf..e74c3bb 100644
--- a/Eigen/MetisSupport
+++ b/Eigen/MetisSupport
@@ -27,8 +27,9 @@
   * It can be used just as any other built-in method as explained in \link OrderingMethods_Module here. \endlink
   */
 
-
+// IWYU pragma: begin_exports
 #include "src/MetisSupport/MetisSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/OrderingMethods b/Eigen/OrderingMethods
index 29691a6..5031de7 100644
--- a/Eigen/OrderingMethods
+++ b/Eigen/OrderingMethods
@@ -63,8 +63,11 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/OrderingMethods/Amd.h"
 #include "src/OrderingMethods/Ordering.h"
+// IWYU pragma: end_exports
+
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_ORDERINGMETHODS_MODULE_H
diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport
index 234619a..5d3cb38 100644
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -42,7 +42,9 @@
   *
   */
 
+// IWYU pragma: begin_exports
 #include "src/PaStiXSupport/PaStiXSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
index 340edf5..c022ce1 100644
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
@@ -28,7 +28,9 @@
   * 
   */
 
+// IWYU pragma: begin_exports
 #include "src/PardisoSupport/PardisoSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/QR b/Eigen/QR
index 1f6c22e..2f57a60 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -31,6 +31,7 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
@@ -40,6 +41,7 @@
 #include "src/QR/HouseholderQR_LAPACKE.h"
 #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index d83495e..a25f5ef 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -29,6 +29,11 @@
   */
 
 #include "CholmodSupport"
+
+// IWYU pragma: begin_exports
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif
diff --git a/Eigen/SVD b/Eigen/SVD
index 8241c73..b13ec2f 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -31,6 +31,7 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
@@ -47,6 +48,7 @@
 #endif
 #include "src/SVD/BDCSVD_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/SparseCholesky b/Eigen/SparseCholesky
index d2b1f12..967196f 100644
--- a/Eigen/SparseCholesky
+++ b/Eigen/SparseCholesky
@@ -30,8 +30,11 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/SparseCholesky/SimplicialCholesky.h"
 #include "src/SparseCholesky/SimplicialCholesky_impl.h"
+// IWYU pragma: end_exports
+
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SPARSECHOLESKY_MODULE_H
diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index 352f1ec..292b089 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -34,6 +34,7 @@
   * This module depends on: Core.
   */
 
+// IWYU pragma: begin_exports
 #include "src/SparseCore/SparseUtil.h"
 #include "src/SparseCore/SparseMatrixBase.h"
 #include "src/SparseCore/SparseAssign.h"
@@ -62,6 +63,7 @@
 #include "src/SparseCore/SparsePermutation.h"
 #include "src/SparseCore/SparseFuzzy.h"
 #include "src/SparseCore/SparseSolverBase.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/SparseLU b/Eigen/SparseLU
index 047cf0d..9ec6ac7 100644
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -25,6 +25,7 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #include "src/SparseLU/SparseLU_Structs.h"
 #include "src/SparseLU/SparseLU_SupernodalMatrix.h"
 #include "src/SparseLU/SparseLUImpl.h"
@@ -42,6 +43,7 @@
 #include "src/SparseLU/SparseLU_pruneL.h"
 #include "src/SparseLU/SparseLU_Utils.h"
 #include "src/SparseLU/SparseLU.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/SparseQR b/Eigen/SparseQR
index f5fc5fa..4bc144c 100644
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -28,8 +28,10 @@
   * 
   */
 
+// IWYU pragma: begin_exports
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/StdDeque b/Eigen/StdDeque
index bc68397..29550e2 100644
--- a/Eigen/StdDeque
+++ b/Eigen/StdDeque
@@ -20,7 +20,9 @@
 
 #else
 
+// IWYU pragma: begin_exports
 #include "src/StlSupport/StdDeque.h"
+// IWYU pragma: end_exports
 
 #endif
 
diff --git a/Eigen/StdList b/Eigen/StdList
index 4c6262c..8a22fc8 100644
--- a/Eigen/StdList
+++ b/Eigen/StdList
@@ -19,7 +19,9 @@
 
 #else
 
+// IWYU pragma: begin_exports
 #include "src/StlSupport/StdList.h"
+// IWYU pragma: end_exports
 
 #endif
 
diff --git a/Eigen/StdVector b/Eigen/StdVector
index 0c4697a..e68f6b5 100644
--- a/Eigen/StdVector
+++ b/Eigen/StdVector
@@ -20,7 +20,9 @@
 
 #else
 
+// IWYU pragma: begin_exports
 #include "src/StlSupport/StdVector.h"
+// IWYU pragma: end_exports
 
 #endif
 
diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport
index 59312a8..33997fa 100644
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -57,7 +57,9 @@
   *
   */
 
+// IWYU pragma: begin_exports
 #include "src/SuperLUSupport/SuperLUSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport
index 00eec80..eea150e 100644
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -33,7 +33,9 @@
   *
   */
 
+// IWYU pragma: begin_exports
 #include "src/UmfPackSupport/UmfPackSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h
index 20e5bd9..40a36da 100644
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -27,7 +27,9 @@
 
   EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
   {
-    return all_unroller<Derived, UnrollCount-1, InnerSize>::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i);
+    return all_unroller<Derived, UnrollCount - 1, InnerSize>::run(mat) &&
+           mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i) !=
+               typename Derived::CoeffReturnType(0);
   }
 };
 
@@ -54,7 +56,9 @@
 
   EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
   {
-    return any_unroller<Derived, UnrollCount-1, InnerSize>::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i);
+    return any_unroller<Derived, UnrollCount - 1, InnerSize>::run(mat) ||
+           mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i) !=
+               typename Derived::CoeffReturnType(0);
   }
 };
 
@@ -94,7 +98,9 @@
   {
     for(Index i = 0; i < derived().outerSize(); ++i)
       for(Index j = 0; j < derived().innerSize(); ++j)
-        if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false;
+        if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i) ==
+            Scalar(0))
+          return false;
     return true;
   }
 }
@@ -118,7 +124,9 @@
   {
     for(Index i = 0; i < derived().outerSize(); ++i)
       for(Index j = 0; j < derived().innerSize(); ++j)
-        if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true;
+        if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i) !=
+            Scalar(0))
+          return true;
     return false;
   }
 }
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 405cc71..9fd0652 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -72,6 +72,14 @@
     EIGEN_DEVICE_FUNC
     inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
 
+    /** \returns the value of the coefficient as if \c *this was a dense matrix.
+     */
+    EIGEN_DEVICE_FUNC
+    inline Scalar coeff(Index row, Index col) const {
+      eigen_assert(row >= 0 && col >= 0 && row < rows() && col <= cols());
+      return row == col ? diagonal().coeff(row) : Scalar(0);
+    }
+
     /** \returns the number of rows. */
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
     inline Index rows() const { return diagonal().size(); }
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index d65c63a..e651dbc 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -46,45 +46,46 @@
   enum {
     HasHalfPacket = 0,
 
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 0,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasSign      = 1,
-    HasBlend     = 0,
+    HasSign = 1,
+    HasBlend = 0,
     // This flag is used to indicate whether packet comparison is supported.
     // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
-    HasCmp       = 0,
+    HasCmp = 0,
 
-    HasDiv    = 0,
+    HasDiv = 0,
     HasReciprocal = 0,
-    HasSqrt   = 0,
-    HasRsqrt  = 0,
-    HasExp    = 0,
-    HasExpm1  = 0,
-    HasLog    = 0,
-    HasLog1p  = 0,
-    HasLog10  = 0,
-    HasPow    = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasExpm1 = 0,
+    HasLog = 0,
+    HasLog1p = 0,
+    HasLog10 = 0,
+    HasPow = 0,
 
-    HasSin    = 0,
-    HasCos    = 0,
-    HasTan    = 0,
-    HasASin   = 0,
-    HasACos   = 0,
-    HasATan   = 0,
-    HasSinh   = 0,
-    HasCosh   = 0,
-    HasTanh   = 0,
+    HasSin = 0,
+    HasCos = 0,
+    HasTan = 0,
+    HasASin = 0,
+    HasACos = 0,
+    HasATan = 0,
+    HasATanh = 0,
+    HasSinh = 0,
+    HasCosh = 0,
+    HasTanh = 0,
     HasLGamma = 0,
     HasDiGamma = 0,
     HasZeta = 0,
@@ -99,10 +100,10 @@
     HasIGammac = 0,
     HasBetaInc = 0,
 
-    HasRound  = 0,
-    HasRint   = 0,
-    HasFloor  = 0,
-    HasCeil   = 0
+    HasRound = 0,
+    HasRint = 0,
+    HasFloor = 0,
+    HasCeil = 0
   };
 };
 
@@ -857,9 +858,6 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); }
 
-/** \internal \returns the arc tangent of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
 
 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -869,10 +867,26 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); }
 
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
+patan(const Packet& a) {
+  EIGEN_USING_STD(atan);
+  return atan(a);
+}
+
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); }
 
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
+patanh(const Packet& a) {
+  EIGEN_USING_STD(atanh);
+  return atanh(a);
+}
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); }
@@ -1192,27 +1206,34 @@
 }
 
 template <typename Packet, bool IsScalar = is_scalar<Packet>::value,
-    bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
-    struct psignbit_impl;
+          bool IsInteger =
+              NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
+struct psignbit_impl;
 template <typename Packet, bool IsInteger>
 struct psignbit_impl<Packet, true, IsInteger> {
-     EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return numext::signbit(a); }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(
+      const Packet& a) {
+    return numext::signbit(a);
+  }
 };
 template <typename Packet>
 struct psignbit_impl<Packet, false, false> {
-    // generic implementation if not specialized in PacketMath.h
-    // slower than arithmetic shift
-    typedef typename unpacket_traits<Packet>::type Scalar;
-    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Packet run(const Packet& a) {
-        const Packet cst_pos_one = pset1<Packet>(Scalar(1));
-        const Packet cst_neg_one = pset1<Packet>(Scalar(-1));
-        return pcmp_eq(por(pand(a, cst_neg_one), cst_pos_one), cst_neg_one);
-    }
+  // generic implementation if not specialized in PacketMath.h
+  // slower than arithmetic shift
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Packet run(const Packet& a) {
+    const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+    const Packet cst_neg_one = pset1<Packet>(Scalar(-1));
+    return pcmp_eq(por(pand(a, cst_neg_one), cst_pos_one), cst_neg_one);
+  }
 };
 template <typename Packet>
 struct psignbit_impl<Packet, false, true> {
-    // generic implementation for integer packets
-    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return pcmp_lt(a, pzero(a)); }
+  // generic implementation for integer packets
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(
+      const Packet& a) {
+    return pcmp_lt(a, pzero(a));
+  }
 };
 /** \internal \returns the sign bit of \a a as a bitmask*/
 template <typename Packet>
@@ -1256,6 +1277,25 @@
   return result;
 }
 
+/** \internal \returns the argument of \a a as a complex number */
+template <typename Packet, std::enable_if_t<is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
+  return Packet(numext::arg(a));
+}
+
+/** \internal \returns the argument of \a a as a complex number */
+template <typename Packet, std::enable_if_t<!is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsComplex,
+                      THIS METHOD IS FOR COMPLEX TYPES ONLY)
+  using RealPacket = typename unpacket_traits<Packet>::as_real;
+  // a                                              // r     i    r     i    ...
+  RealPacket aflip = pcplxflip(a).v;                // i     r    i     r    ...
+  RealPacket result = patan2(aflip, a.v);           // atan2 crap atan2 crap ...
+  return (Packet)pand(result, peven_mask(result));  // atan2 0    atan2 0    ...
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 18792cb..89eefe1 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -86,6 +86,9 @@
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
+      carg, scalar_carg_op,
+      complex argument, \sa ArrayBase::carg DOXCOMMA MatrixBase::cwiseCArg)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index 897d7b0..053905f 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -253,6 +253,11 @@
   return internal::print_matrix(s, m.eval(), EIGEN_DEFAULT_IO_FORMAT);
 }
 
+template <typename Derived>
+std::ostream& operator<<(std::ostream& s, const DiagonalBase<Derived>& m) {
+  return internal::print_matrix(s, m.derived(), EIGEN_DEFAULT_IO_FORMAT);
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_IO_H
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index def5428..ecc6f38 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1015,11 +1015,15 @@
 }
 
 //MSVC defines a _isnan builtin function, but for double only
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+#endif
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
 
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+#endif
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
 
@@ -1033,12 +1037,16 @@
   #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
 #endif
 
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+#endif
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x)      { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x)       { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x)      { return __builtin_isinf(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x)       { return __builtin_isinf(x); }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+#endif
 
 #undef EIGEN_TMP_NOOPT_ATTRIB
 
@@ -1095,6 +1103,8 @@
 {
   return fmin(x, y);
 }
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
@@ -1106,6 +1116,7 @@
   return fminl(x, y);
 #endif
 }
+#endif
 
 template<typename T>
 EIGEN_DEVICE_FUNC
@@ -1125,6 +1136,7 @@
 {
   return fmax(x, y);
 }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
@@ -1137,6 +1149,7 @@
 #endif
 }
 #endif
+#endif
 
 #if defined(SYCL_DEVICE_ONLY)
 
@@ -1300,8 +1313,8 @@
   return fabs(x - y);
 }
 
-#if !defined(EIGEN_GPUCC)
 // HIP and CUDA do not support long double.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 53362ef..d066be9 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -243,12 +243,25 @@
   static inline double dummy_precision() { return 1e-12; }
 };
 
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct NumTraits<long double>
   : GenericNumTraits<long double>
 {
-  EIGEN_CONSTEXPR
-  static inline long double dummy_precision() { return 1e-15l; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double
+  dummy_precision() {
+    return static_cast<long double>(1e-15l);
+  }
+
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double epsilon() {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };
+#endif
 
 template<typename Real_> struct NumTraits<std::complex<Real_> >
   : GenericNumTraits<std::complex<Real_> >
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 60a75b1..4eae3b3 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -538,7 +538,10 @@
       if (ColsAtCompileTime == 1 && list.size() == 1) {
         eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
         resize(list_size, ColsAtCompileTime);
-        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+        if (list.begin()->begin() != nullptr) {
+          std::copy(list.begin()->begin(), list.begin()->end(),
+                    m_storage.data());
+        }
       } else {
         eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
         eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
diff --git a/Eigen/src/Core/SkewSymmetricMatrix3.h b/Eigen/src/Core/SkewSymmetricMatrix3.h
index 7f6b5fd..5efbc44 100644
--- a/Eigen/src/Core/SkewSymmetricMatrix3.h
+++ b/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -383,6 +383,7 @@
 template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, SkewSymmetric2Dense>
 {
+  EIGEN_DEVICE_FUNC
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     if((dst.rows()!=3) || (dst.cols()!=3)) {
@@ -397,10 +398,11 @@
     dst(1, 2) = -v(0);
     dst(2, 1) = v(0);
   }
-  
+  EIGEN_DEVICE_FUNC
   static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   { dst.vector() += src.vector(); }
-  
+
+  EIGEN_DEVICE_FUNC
   static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   { dst.vector() -= src.vector(); }
 };
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 762cbfc..737df13 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -350,8 +350,8 @@
     typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
     typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
     typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
-    typedef typename ReturnType<internal::member_all>::Type AllReturnType;
-    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
+    typedef typename ReturnType<internal::member_all, bool>::Type AllReturnType;
+    typedef typename ReturnType<internal::member_any, bool>::Type AnyReturnType;
     typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
     typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
     typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 5e5eead..434f893 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -338,8 +338,9 @@
   }
 };
 
-template<typename Scalar, bool is_min, int NaNPropagation>
-struct functor_traits<minmax_coeff_visitor<Scalar, is_min, NaNPropagation> > {
+template <typename Derived, bool is_min, int NaNPropagation>
+struct functor_traits<minmax_coeff_visitor<Derived, is_min, NaNPropagation> > {
+  using Scalar = typename Derived::Scalar;
   enum {
     Cost = NumTraits<Scalar>::AddCost,
     PacketAccess = packet_traits<Scalar>::HasCmp
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index cb7d7b8..8ce181e 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -58,6 +58,12 @@
 
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
+patanh<Packet8f>(const Packet8f& _x) {
+  return patanh_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
 plog<Packet8f>(const Packet8f& _x) {
   return plog_float(_x);
 }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 60e1ff4..ea4e0ed 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -68,7 +68,7 @@
     size = 8,
     HasHalfPacket = 1,
 
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
@@ -76,6 +76,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index af47a85..1636e90 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -16,8 +16,6 @@
 
 namespace internal {
 
-#if EIGEN_HAS_AVX512_MATH
-
 #define EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
   const Packet16f p16f_##NAME = pset1<Packet16f>(X)
 
@@ -165,8 +163,12 @@
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  // Double requires 2 Newton-Raphson steps for convergence.
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(
+      _x, _mm512_rsqrt28_pd(_x));
+#else
   return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+#endif
 }
 #else
 template <>
@@ -185,7 +187,6 @@
 
 // prsqrt for float.
 #if defined(EIGEN_VECTORIZE_AVX512ER)
-
 template <>
 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
   return _mm512_rsqrt28_ps(x);
@@ -221,10 +222,10 @@
   return generic_reciprocal_newton_step<Packet16f, /*Steps=*/1>::run(a, _mm512_rcp14_ps(a));
 #endif
 }
+#endif
 
 F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal)
-#endif
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet16f plog1p<Packet16f>(const Packet16f& _x) {
@@ -242,9 +243,6 @@
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
 
-#endif  // EIGEN_HAS_AVX512_MATH
-
-
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
 psin<Packet16f>(const Packet16f& _x) {
@@ -276,6 +274,12 @@
 }
 
 template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
+patanh<Packet16f>(const Packet16f& _x) {
+  return patanh_float(_x);
+}
+
+template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 patan<Packet8d>(const Packet8d& _x) {
   return patan_double(_x);
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 916babf..bb98be2 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -30,13 +30,6 @@
 #endif
 #endif
 
-// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
-#if EIGEN_GNUC_STRICT_AT_LEAST(5,3,0) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
-#define EIGEN_HAS_AVX512_MATH 1
-#else
-#define EIGEN_HAS_AVX512_MATH 0
-#endif
-
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
@@ -72,35 +65,35 @@
     size = 16,
     HasHalfPacket = 1,
 
-    HasCmp    = 1,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 1,
-    HasAbs2   = 0,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 0,
-    HasLog    = EIGEN_HAS_AVX512_MATH,
-    HasLog1p  = EIGEN_HAS_AVX512_MATH,
-    HasExp    = EIGEN_HAS_AVX512_MATH,
-    HasExpm1  = EIGEN_HAS_AVX512_MATH,
-    HasSqrt   = EIGEN_HAS_AVX512_MATH,
-    HasRsqrt  = EIGEN_HAS_AVX512_MATH,
-    HasBessel = EIGEN_HAS_AVX512_MATH,
-    HasNdtri  = EIGEN_HAS_AVX512_MATH,
-    HasSin    = EIGEN_FAST_MATH,
-    HasCos    = EIGEN_FAST_MATH,
-    HasTanh   = EIGEN_FAST_MATH,
-    HasErf    = EIGEN_FAST_MATH,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasBessel = 1,
+    HasNdtri = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasBlend = 0,
-    HasRound  = 1,
-    HasFloor  = 1,
-    HasCeil   = 1,
-    HasRint   = 1
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
 };
 #endif
@@ -116,29 +109,28 @@
     HasHalfPacket = 1,
 
     HasAbs = 1,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasBlend = 0,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
-#if EIGEN_HAS_AVX512_MATH
+    HasATanh = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
     HasLog = 1,
-    HasLog1p  = 1,
-    HasExpm1  = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasNdtri = 1,
-    HasBessel  = 1,
+    HasBessel = 1,
     HasExp = 1,
-    HasSqrt = EIGEN_FAST_MATH,
-    HasRsqrt = EIGEN_FAST_MATH,
     HasReciprocal = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-#endif
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasRound = 1,
     HasFloor = 1,
@@ -155,14 +147,12 @@
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-#if EIGEN_HAS_AVX512_MATH
-    HasLog  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLog = 1,
     HasExp = 1,
-    HasSqrt = EIGEN_FAST_MATH,
-    HasRsqrt = EIGEN_FAST_MATH,
-#endif
     HasATan = 1,
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasRound = 1,
     HasFloor = 1,
@@ -2293,21 +2283,19 @@
     HasInsert = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
-#if EIGEN_HAS_AVX512_MATH
+    HasSqrt = 1,
+    HasRsqrt = 1,
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,  // Currently fails test with bad accuracy.
-    HasLog1p  = 1,
-    HasExpm1  = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasNdtri = 1,
-    HasBessel  = 1,
+    HasBessel = 1,
 #endif
     HasExp = 1,
-    HasSqrt = EIGEN_FAST_MATH,
-    HasRsqrt = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-#endif
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1
   };
 };
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index bbc016a..289815d 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -16,13 +16,6 @@
 
 namespace internal {
 
-// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
-#if EIGEN_GNUC_STRICT_AT_LEAST(5,3,0) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
-#define EIGEN_HAS_AVX512_MATH 1
-#else
-#define EIGEN_HAS_AVX512_MATH 0
-#endif
-
 typedef __m512h Packet32h;
 typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
 typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
@@ -54,15 +47,15 @@
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
     // These ones should be implemented in future
-    HasLog = EIGEN_HAS_AVX512_MATH,
-    HasLog1p = EIGEN_HAS_AVX512_MATH,
-    HasExp = EIGEN_HAS_AVX512_MATH,
-    HasExpm1 = EIGEN_HAS_AVX512_MATH,
-    HasSqrt = EIGEN_HAS_AVX512_MATH,
-    HasRsqrt = EIGEN_HAS_AVX512_MATH,
-    HasBessel = 0,  // EIGEN_HAS_AVX512_MATH,
-    HasNdtri = 0,   // EIGEN_HAS_AVX512_MATH,
+    HasBessel = 0,
+    HasNdtri = 0,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
@@ -135,7 +128,8 @@
 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return half_impl::raw_uint16_to_half(
-      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
+      static_cast<unsigned short>(_mm256_extract_epi16(
+          _mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
 #else
   Eigen::half dest[32];
   _mm512_storeu_ph(dest, from);
@@ -175,9 +169,10 @@
 template <>
 EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
   __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
-  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
-                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
-                               a);
+  return _mm512_permutexvar_ph(
+      _mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8,
+                       8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+      a);
 }
 
 // ploadquad
@@ -185,7 +180,8 @@
 EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
   __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
   return _mm512_permutexvar_ph(
-      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3,
+                       3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
       a);
 }
 
@@ -206,59 +202,70 @@
 // pmin
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a,
+                                              const Packet32h& b) {
   return _mm512_min_ph(a, b);
 }
 
 // pmax
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a,
+                                              const Packet32h& b) {
   return _mm512_max_ph(a, b);
 }
 
 // plset
 template <>
 EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
-  return _mm512_add_ph(_mm512_set1_ph(a),
-                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
-                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
-                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+  return _mm512_add_ph(
+      _mm512_set1_ph(a),
+      _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f,
+                    23.0f, 22.0f, 21.0f, 20.0f, 19.0f, 18.0f, 17.0f, 16.0f,
+                    15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
+                    6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
 }
 
 // por
 
 template <>
 EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+  return _mm512_castsi512_ph(
+      _mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
 // pxor
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+  return _mm512_castsi512_ph(
+      _mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
 // pand
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+  return _mm512_castsi512_ph(
+      _mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
 // pandnot
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+  return _mm512_castsi512_ph(
+      _mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
 }
 
 // pselect
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask,
+                                           const Packet32h& a,
+                                           const Packet32h& b) {
+  __mmask32 mask32 = _mm512_cmp_epi16_mask(
+      _mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
   return _mm512_mask_blend_ph(mask32, a, b);
 }
 
@@ -267,7 +274,8 @@
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(
+      _mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
 }
 
 // pcmp_le
@@ -275,7 +283,8 @@
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(
+      _mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
 }
 
 // pcmp_lt
@@ -283,82 +292,101 @@
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(
+      _mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
 }
 
 // pcmp_lt_or_nan
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a,
+                                             const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(
+      _mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
 }
 
 // padd
 
 template <>
-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a,
+                                              const Packet32h& b) {
   return _mm512_add_ph(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return _mm256_castph_si256(
+      _mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
   return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
 }
 
 // psub
 
 template <>
-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a,
+                                              const Packet32h& b) {
   return _mm512_sub_ph(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return _mm256_castph_si256(
+      _mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
   return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
 }
 
 // pmul
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a,
+                                              const Packet32h& b) {
   return _mm512_mul_ph(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return _mm256_castph_si256(
+      _mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
   return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
 }
 
 // pdiv
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a,
+                                              const Packet32h& b) {
   return _mm512_div_ph(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return _mm256_castph_si256(
+      _mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
   return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
 }
 
@@ -369,11 +397,14 @@
   // Work-around for default std::round rounding mode.
 
   // Mask for the sign bit
-  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+  const Packet32h signMask =
+      pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
   // The largest half-preicision float less than 0.5
-  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+  const Packet32h prev0dot5 =
+      pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
 
-  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a),
+                              _MM_FROUND_TO_ZERO);
 }
 
 // print
@@ -415,10 +446,13 @@
 
 // predux_half_dowto4
 template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+EIGEN_STRONG_INLINE Packet16h
+predux_half_dowto4<Packet32h>(const Packet32h& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
-  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+  __m256i lowHalf =
+      _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
+  __m256i highHalf =
+      _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
 
   return Packet16h(padd<Packet16h>(lowHalf, highHalf));
 #else
@@ -443,69 +477,89 @@
 // pmadd
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b,
+                                    const Packet32h& c) {
   return _mm512_fmadd_ph(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b,
+                                    const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fmadd_ph(
+      _mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b,
+                                   const Packet8h& c) {
+  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b),
+                                       _mm_castsi128_ph(c)));
 }
 
 // pmsub
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b,
+                                    const Packet32h& c) {
   return _mm512_fmsub_ph(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b,
+                                    const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fmsub_ph(
+      _mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b,
+                                   const Packet8h& c) {
+  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b),
+                                       _mm_castsi128_ph(c)));
 }
 
 // pnmadd
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b,
+                                     const Packet32h& c) {
   return _mm512_fnmadd_ph(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b,
+                                     const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fnmadd_ph(
+      _mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b,
+                                    const Packet8h& c) {
+  return _mm_castph_si128(_mm_fnmadd_ph(
+      _mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
 }
 
 // pnmsub
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b,
+                                     const Packet32h& c) {
   return _mm512_fnmsub_ph(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b,
+                                     const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fnmsub_ph(
+      _mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b,
+                                    const Packet8h& c) {
+  return _mm_castph_si128(_mm_fnmsub_ph(
+      _mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
 }
 
 #endif
@@ -552,9 +606,11 @@
 
   EIGEN_UNROLL_LOOP
   for (int i = 0; i < 16; i++) {
-    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]),
+                                     _mm512_castph_si512(a.packet[2 * i + 1]));
     t[2 * i + 1] =
-        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]),
+                              _mm512_castph_si512(a.packet[2 * i + 1]));
   }
 
   __m512i p[32];
@@ -583,16 +639,24 @@
 
   __m512i f[32];
 
-#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
-  do {                                                                                              \
-    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
-    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
-    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
-    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
-    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
-    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
-    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
-    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                      \
+  do {                                                                        \
+    f[Y * 8] = _mm512_inserti32x4(f[Y * 8],                                   \
+                                  _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
+    f[Y * 8 + 1] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X);         \
+    f[Y * 8 + 2] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X);         \
+    f[Y * 8 + 3] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X);         \
+    f[Y * 8 + 4] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X);         \
+    f[Y * 8 + 5] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X);         \
+    f[Y * 8 + 6] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X);         \
+    f[Y * 8 + 7] = _mm512_inserti32x4(                                        \
+        f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X);         \
   } while (false);
 
   PACKET32H_TRANSPOSE_HELPER(0, 0);
@@ -624,10 +688,14 @@
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
   __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
-  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
-  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
-  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]),
+                             _mm512_castph_si512(a.packet[1]));
+  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]),
+                             _mm512_castph_si512(a.packet[1]));
+  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]),
+                             _mm512_castph_si512(a.packet[3]));
+  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]),
+                             _mm512_castph_si512(a.packet[3]));
 
   p0 = _mm512_unpacklo_epi32(t0, t2);
   p1 = _mm512_unpackhi_epi32(t0, t2);
@@ -667,15 +735,19 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
-  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
-                               a);
+  return _mm512_permutexvar_ph(
+      _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+                       31),
+      a);
 }
 
 // pscatter
 
 template <>
-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to,
+                                                   const Packet32h& from,
+                                                   Index stride) {
   EIGEN_ALIGN64 half aux[32];
   pstore(aux, from);
 
@@ -688,14 +760,19 @@
 // pgather
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+EIGEN_STRONG_INLINE Packet32h
+pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
   return _mm512_castsi512_ph(_mm512_set_epi16(
-      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
-      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
-      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
-      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
-      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
-      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
+      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x,
+      from[28 * stride].x, from[27 * stride].x, from[26 * stride].x,
+      from[25 * stride].x, from[24 * stride].x, from[23 * stride].x,
+      from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
+      from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
+      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x,
+      from[13 * stride].x, from[12 * stride].x, from[11 * stride].x,
+      from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+      from[7 * stride].x, from[6 * stride].x, from[5 * stride].x,
+      from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
       from[1 * stride].x, from[0 * stride].x));
 }
 
@@ -718,16 +795,19 @@
 template <>
 EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
 template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&,
+                                                const Packet16h&);
 
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a,
+                                                const Packet16h& b) {
   __m512d result = _mm512_undefined_pd();
   result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
   result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
   return _mm512_castpd_ph(result);
 }
 
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a,
+                                           Packet16h& b) {
   a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
   b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
 }
@@ -838,7 +918,8 @@
 
 // pfrexp
 template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a,
+                                                Packet32h& exponent) {
   Packet16h low;
   Packet16h high;
   extract2Packet16h(a, low, high);
@@ -856,7 +937,8 @@
 
 // pldexp
 template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a,
+                                                const Packet32h& exponent) {
   Packet16h low;
   Packet16h high;
   extract2Packet16h(a, low, high);
@@ -874,4 +956,4 @@
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
\ No newline at end of file
+#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
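
The half-precision FMA wrappers reformatted above all follow Eigen's usual conventions for the four fused multiply-add variants; only the bit-casts between the integer and _ph register views change with the packet width. A minimal scalar sketch of the assumed semantics, using hypothetical helper names rather than the packet API:

// Scalar semantics assumed by the Packet32h/16h/8h wrappers above
// (hypothetical names; the real primitives operate on whole registers).
inline float madd_ref (float a, float b, float c) { return  a * b + c;  }  // pmadd  (fmadd_ph)
inline float msub_ref (float a, float b, float c) { return  a * b - c;  }  // pmsub  (fmsub_ph)
inline float nmadd_ref(float a, float b, float c) { return -(a * b) + c; }  // pnmadd (fnmadd_ph)
inline float nmsub_ref(float a, float b, float c) { return -(a * b) - c; }  // pnmsub (fnmsub_ph)
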
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 60f28ff..1ce35c3 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -177,7 +177,8 @@
 
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
 {
-  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size &&
+                        "number of elements will gather past end of packet");
   EIGEN_ALIGN16 Scalar af[2];
   for (Index i = 0; i < n; i++) {
     af[i] = from[i*stride];
@@ -194,7 +195,8 @@
 }
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
 {
-  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size &&
+                        "number of elements will scatter past end of packet");
   EIGEN_ALIGN16 Scalar af[2];
   pstore<Scalar>((Scalar *) af, from);
   for (Index i = 0; i < n; i++) {
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 45761e2..783868e 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -60,6 +60,12 @@
   return patan_float(_x);
 }
 
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
+patanh<Packet4f>(const Packet4f& _x) {
+  return patanh_float(_x);
+}
+
 #ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index adb2eac..c08f4d5 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -2187,23 +2187,13 @@
 #define MICRO_EXTRA_COLS(N) \
   gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
-EIGEN_STRONG_INLINE void gemm_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename DataMapper,
+          const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(
+    const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+    Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+    Index col, Index rows, Index cols, Index remaining_rows,
+    const Packet& pAlpha, const Packet& pMask) {
   MICRO_EXTRA(MICRO_EXTRA_COLS, cols-col, true)
 }
 
@@ -2621,24 +2611,14 @@
 #define MICRO_COMPLEX_EXTRA_COLS(N) \
   gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc,
+          typename DataMapper, const Index accCols, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
+    const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+    Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+    Index col, Index rows, Index cols, Index remaining_rows,
+    const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask) {
   MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true)
 }
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index 28868ca..abce5a9 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -30,22 +30,13 @@
   const Packet& pAlpha,
   const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask);
+template <typename Scalar, typename Packet, typename DataMapper,
+          const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(
+    const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+    Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+    Index col, Index rows, Index cols, Index remaining_rows,
+    const Packet& pAlpha, const Packet& pMask);
 
 template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);
@@ -66,23 +57,14 @@
   const Packet& pAlphaImag,
   const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask);
+template <typename Scalar, typename Packet, typename Packetc,
+          typename DataMapper, const Index accCols, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
+    const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+    Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+    Index col, Index rows, Index cols, Index remaining_rows,
+    const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask);
 
 template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs);
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index c6d4216..4b64d33 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -11,13 +11,6 @@
 
 namespace internal {
 
-EIGEN_ALWAYS_INLINE void scaleAndStore(float* result, Packet4f& acc, const Packet4f& pAlpha)
-{
-  Packet4f result_block = ploadu<Packet4f>(result);
-  result_block = pmadd(acc, pAlpha, result_block);
-  pstoreu(result, result_block);
-}
-
 template<bool zero>
 EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA)
 {
@@ -30,123 +23,207 @@
   }
 }
 
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index extra_rows)
-{
-  if (zero) {
-    Packet8bf lhs1 = ploadu_partial<Packet8bf>(indexA, extra_rows);
-    Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
-    return vec_mergeh(lhs1.m_val, lhs2.m_val);
-  } else {
-    return reinterpret_cast<Packet8us>(ploadu_partial<Packet4i>(reinterpret_cast<const int *>(indexA), extra_rows));
-  }
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB,
+                                              Index strideB, Index i) {
+  return loadBfloat16<zero>(blockB + strideB * i);
 }
 
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, Index extra_rows)
-{
-  return loadBfloat16Extra<zero>(indexA + row*strideA, extra_rows);
-}
-
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, Index i, Index k)
-{
-  return loadBfloat16<zero>(baseB + strideB*4*i + (k*4));
-}
-
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, Index i, Index k, Index extra_cols)
-{
-  return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k*extra_cols+offsetB, extra_cols);
-}
-
-template<Index num_acc, Index num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
-EIGEN_STRONG_INLINE void KLoop
-(
-  const bfloat16* indexA,
-  const bfloat16* indexB,
-  __vector_quad (&quad_acc)[num_acc],
-  Index strideA,
-  Index strideB,
-  Index offsetB,
-  Index k,
-  Index row,
-  Index col,
-  Index extra_rows,
-  Index extra_cols
-)
-{
-  Packet8bf lhs;
+template <Index num_acc, Index num_packets, bool zero, bool rhsExtraCols,
+          bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void KLoop(const bfloat16* indexA, const bfloat16* indexB,
+                               __vector_quad (&quad_acc)[num_acc],
+                               Index strideB, Index k, Index offsetB,
+                               Index extra_cols, Index extra_rows) {
+  Packet8bf lhs = loadBfloat16<zero>(
+      indexA + k * (lhsExtraRows
+                        ? extra_rows
+                        : num_packets));  // a packet of bfloat16 has 8 elements
   Packet8bf rhs[num_acc];
-  if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k*extra_rows, strideA, row, extra_rows);
-  else lhs = loadBfloat16<zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
+
+  for (Index i = 0; i < (num_acc - (rhsExtraCols ? 1 : 0)); i++) {
+    rhs[i] = loadRhsBfloat16<zero>(indexB + k * 4, strideB, i);
+  }
+  if (rhsExtraCols) {
+    rhs[num_acc - 1] = loadRhsBfloat16<zero>(indexB + k * extra_cols - offsetB,
+                                             strideB, num_acc - 1);
+  }
+
   BFLOAT16_UNROLL
-  for(Index i = 0; i < num_acc; i++){
-    if(!rhs_extra_cols)
-      rhs[i] = loadRhsBfloat16<zero>(indexB, strideB, i, k);
-    else{
-      rhs[i] = loadRhsBfloat16ExtraCols<zero>(indexB, strideB, offsetB, col, i, k, extra_cols);
-    }
+  for (Index i = 0; i < num_acc; i++) {
     __builtin_mma_xvbf16ger2pp(&(quad_acc[i]), reinterpret_cast<Packet16uc>(rhs[i].m_val), reinterpret_cast<Packet16uc>(lhs.m_val));
   }
 }
 
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols = false, bool lhsExtraRows = false>
-void colLoopBody(Index& col, Index row, Index depth, Index cols, Index rows, Index offset_row, Index block_index, const Packet4f& pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, Index extra_cols = 0, Index extra_rows = 0)
-{
-  const Index step = rhsExtraCols ? 1 : (num_acc * 4); //each accumulator has 4 elements
-  const bfloat16* indexB = rhsExtraCols ? blockB : (blockB + 4*offsetB + strideB*col);
-
-  while(col + step <= cols){
-    Index k = 0;
-    Packet4f acc[num_acc][4];
-    __vector_quad quad_acc[num_acc];
- 
-    BFLOAT16_UNROLL
-    for(Index i = 0; i < num_acc; i++)
-      __builtin_mma_xxsetaccz(&(quad_acc[i]));
-
-    for(; k + 2 <= depth; k += 2){
-      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows,
+                                      const Packet4f pAlpha, float* result,
+                                      Index extra_cols, Index extra_rows) {
+  Index x = 0;
+  do {
+    Packet4f result_block = ploadu<Packet4f>(result);
+    result_block = pmadd(acc[x], pAlpha, result_block);
+    if (lhsExtraRows) {
+      pstoreu_partial(result, result_block, extra_rows);
+    } else {
+      pstoreu(result, result_block);
     }
-    if(depth&1){
-      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-(offset_row&(num_packets-1)), indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
-    }
+    result += rows;
+  } while (++x < (rhsExtraCols ? extra_cols : 4));
+}
 
-    BFLOAT16_UNROLL
-    for(Index i = 0; i < num_acc; i++)
-      __builtin_mma_disassemble_acc((void*)acc[i], &(quad_acc[i]));
+#define MAX_BFLOAT16_ACC 8
 
-    for(Index i = 0; i < num_acc; i++){
-      if(lhsExtraRows){
-        float *r = result + (col+i*4)*rows + row;
-        for(Index x = 0; x < extra_cols; x++, r += rows){
-          Packet4f result_block = ploadu_partial<Packet4f>(r, extra_rows);
-          result_block = pmadd(acc[i][x], pAlpha, result_block);
-          pstoreu_partial<float>(r, result_block, extra_rows);
-        }
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols,
+          bool lhsExtraRows>
+void colLoopBody(Index& col, Index depth, Index cols, Index rows,
+                 const Packet4f pAlpha, const bfloat16* indexA,
+                 const bfloat16* indexB, Index strideB, Index offsetB,
+                 float* result, Index extra_rows) {
+  const Index step = (num_acc * 4);  // each accumulator has 4 elements
+  const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
+
+  do {
+    for (Index offset_row = 0; offset_row < num_packets;
+         offset_row += 4, indexA += 8, result += 4) {
+      Index k;
+      Packet4f acc[num_acc][4];
+      __vector_quad quad_acc[num_acc];
+
+      BFLOAT16_UNROLL
+      for (k = 0; k < num_acc; k++) __builtin_mma_xxsetaccz(&(quad_acc[k]));
+
+      for (k = 0; k + 2 <= depth; k += 2) {
+        KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(
+            indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols,
+            extra_rows);
       }
-      else{
-        if(rhsExtraCols){
-          float *r = result + (col+i*4)*rows + row + offset_row;
-          for(Index x = 0; x < cols-col; x++, r += rows){
-            scaleAndStore(r,acc[i][x], pAlpha);
-          }
-        }
-        else{
-          float *r = result + (col+i*4)*rows + (block_index*16) + offset_row;
-          for(Index x = 0; x < 4; x++, r += rows){
-            scaleAndStore(r,acc[i][x], pAlpha);
-          }
-        }
+      if (depth & 1) {
+        KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(
+            indexA - offset_row, indexB, quad_acc, strideB, k, offsetB,
+            extra_cols, extra_rows);
       }
+
+      BFLOAT16_UNROLL
+      for (k = 0; k < num_acc; k++)
+        __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
+
+      for (k = 0; k < (num_acc - 1); k++) {
+        storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha,
+                                          result + k * 4 * rows, extra_cols,
+                                          extra_rows);
+      }
+      storeResults<rhsExtraCols, lhsExtraRows>(
+          acc[k], rows, pAlpha, result + k * 4 * rows, extra_cols, extra_rows);
     }
-    if(rhsExtraCols) return;
-    indexB += strideB*step;
-    col += step;
+
+    indexA -= num_packets * 2;
+    indexB += strideB * num_acc;
+    result += (rows * step - num_packets);
+  } while (!rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC) &&
+           (step <= cols - (col += step)));
+}
+
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols,
+          bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols,
+                                           Index rows, const Packet4f pAlpha,
+                                           const bfloat16* indexA,
+                                           const bfloat16* blockB,
+                                           Index strideB, Index offsetB,
+                                           float* result, Index extra_rows) {
+  if (MAX_BFLOAT16_ACC > num_acc) {
+    colLoopBody<num_acc + (rhsExtraCols ? 1 : 0), num_packets, rhsExtraCols,
+                lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                              strideB, offsetB, result, extra_rows);
   }
 }
 
+template <const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows,
+                      const Packet4f pAlpha, const bfloat16* indexA,
+                      const bfloat16* blockB, Index strideB, Index offsetB,
+                      float* result, Index extra_rows) {
+  switch ((cols - col) >> 2) {
+    case 7:
+      colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    case 6:
+      colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    case 5:
+      colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    case 4:
+      colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    case 3:
+      colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    case 2:
+      colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    case 1:
+      colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(
+          col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+          result, extra_rows);
+      break;
+    default:
+      if (rhsExtraCols) {
+        colLoopBody<1, num_packets, true, lhsExtraRows>(
+            col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+            result, extra_rows);
+      }
+      break;
+  }
+}
+
+template <const Index num_packets, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows,
+                                  const Packet4f pAlpha, const bfloat16* indexA,
+                                  const bfloat16* blockB, Index strideB,
+                                  Index offsetB, float* result,
+                                  Index extra_rows = 0) {
+  Index col = 0;
+  if (cols >= (MAX_BFLOAT16_ACC * 4)) {
+    colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(
+        col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0, result,
+        extra_rows);
+    blockB += (strideB >> 2) * col;
+    result += rows * col;
+  }
+  if (cols & 3) {
+    colLoopBodyExtra<num_packets, true, lhsExtraRows>(
+        col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+        result, extra_rows);
+  } else {
+    colLoopBodyExtra<num_packets, false, lhsExtraRows>(
+        col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0, result,
+        extra_rows);
+  }
+}
+
+EIGEN_ALWAYS_INLINE Packet8bf convertF16toF32(const float* res) {
+  Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(
+      reinterpret_cast<Packet16uc>(ploadu<Packet4f>(res + 0)));
+  Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(
+      reinterpret_cast<Packet16uc>(ploadu<Packet4f>(res + 4)));
+  return vec_pack(reinterpret_cast<Packet4ui>(fp16_0),
+                  reinterpret_cast<Packet4ui>(fp16_1));
+}
+
 template<typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 void gemmMMAbfloat16(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
@@ -157,17 +234,56 @@
   ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
 
   typedef typename DataMapper::LinearMapper LinearMapper;
+  Packet8us z = pset1<Packet8us>(0);
   for(Index j = 0; j < cols; j++){
     const LinearMapper res2 = res.getLinearMapper(0, j);
     float *result2 = result + j*rows;
-    BFLOAT16_UNROLL
-    for(Index i = 0; i < rows; i++){
+    Index i = 0;
+    for (; i + 32 <= rows; i += 32) {
+      Packet8us r32_0 = res2.template loadPacket<Packet8bf>(i + 0).m_val;
+      Packet8us r32_1 = res2.template loadPacket<Packet8bf>(i + 8).m_val;
+      Packet8us r32_2 = res2.template loadPacket<Packet8bf>(i + 16).m_val;
+      Packet8us r32_3 = res2.template loadPacket<Packet8bf>(i + 24).m_val;
+      pstore(result2 + i + 0, reinterpret_cast<Packet4f>(vec_mergeh(z, r32_0)));
+      pstore(result2 + i + 4, reinterpret_cast<Packet4f>(vec_mergel(z, r32_0)));
+      pstore(result2 + i + 8, reinterpret_cast<Packet4f>(vec_mergeh(z, r32_1)));
+      pstore(result2 + i + 12,
+             reinterpret_cast<Packet4f>(vec_mergel(z, r32_1)));
+      pstore(result2 + i + 16,
+             reinterpret_cast<Packet4f>(vec_mergeh(z, r32_2)));
+      pstore(result2 + i + 20,
+             reinterpret_cast<Packet4f>(vec_mergel(z, r32_2)));
+      pstore(result2 + i + 24,
+             reinterpret_cast<Packet4f>(vec_mergeh(z, r32_3)));
+      pstore(result2 + i + 28,
+             reinterpret_cast<Packet4f>(vec_mergel(z, r32_3)));
+    }
+    for (; i + 16 <= rows; i += 16) {
+      Packet8us r32_0 = res2.template loadPacket<Packet8bf>(i + 0).m_val;
+      Packet8us r32_1 = res2.template loadPacket<Packet8bf>(i + 8).m_val;
+      pstore(result2 + i + 0, reinterpret_cast<Packet4f>(vec_mergeh(z, r32_0)));
+      pstore(result2 + i + 4, reinterpret_cast<Packet4f>(vec_mergel(z, r32_0)));
+      pstore(result2 + i + 8, reinterpret_cast<Packet4f>(vec_mergeh(z, r32_1)));
+      pstore(result2 + i + 12,
+             reinterpret_cast<Packet4f>(vec_mergel(z, r32_1)));
+    }
+    for (; i + 8 <= rows; i += 8) {
+      Packet8us r32_0 = res2.template loadPacket<Packet8bf>(i + 0).m_val;
+      pstore(result2 + i + 0, reinterpret_cast<Packet4f>(vec_mergeh(z, r32_0)));
+      pstore(result2 + i + 4, reinterpret_cast<Packet4f>(vec_mergel(z, r32_0)));
+    }
+    for (; i + 4 <= rows; i += 4) {
+      Packet8us r32_0 =
+          res2.template loadPacketPartial<Packet8bf>(i + 0, 4).m_val;
+      pstore(result2 + i + 0, reinterpret_cast<Packet4f>(vec_mergeh(z, r32_0)));
+    }
+    for (; i < rows; i++) {
       result2[i] = res2(i);
     }
   }
 
   Index row = 0;
-  Index col = 0;
+  Index col;
 
   if( strideA == -1 ) strideA = depth;
   if( strideB == -1 ) strideB = depth;
@@ -183,90 +299,39 @@
   const Index standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
   Index bigSuffix = (2*8) * (strideA-offsetA);
   const bfloat16* indexA = blockA;
-  const Index offset_factor = 2;
+  const bfloat16* indexB = blockB + 4 * offsetB;
   Index block_index;
+  strideB *= 4;
+  offsetB *= 3;
   for(block_index = 0; block_index < standard_blocks_quantity; block_index++){
     indexA += 2*8*offsetA;
-    for(Index offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum
-      col = 0;
-      colLoopBody<7, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<6, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<5, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<4, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<3, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<2, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<1, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      if(cols > col){
-        Index extra_cols= cols-col;
-        //Remember: It doesnt make sense use multiple acc to extra_cols as we are unrolling col loop
-        colLoopBody<1, 16, true>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
-      }
-    }
+    colLoops<16>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB,
+                 result + row);
     row += 16;
     indexA += bigSuffix;
   }
   //LHS (8x8) block
   if(rows & 8){
     indexA += 1*8*offsetA;
-    for(Index offset_row = 0; offset_row < 8; offset_row += 4){
-      col = 0;
-      colLoopBody<7, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<6, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<5, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<4, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<3, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<2, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<1, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
-    }
-    if(cols > col){
-      Index extra_cols= cols-col;
-
-      for(Index offset_row = 0; offset_row < 8; offset_row += 4){
-        colLoopBody<1, 8, true>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
-      }
-    } //end extra cols
+    colLoops<8>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB,
+                result + row);
     row += 8;
     indexA += (bigSuffix >> 1);
   }
   //LHS (8x4) block
   if(rows & 4){
-    Index offset_row = (rows & 8);
     indexA += 1*4*offsetA;
-    col = 0;
-    colLoopBody<7, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    colLoopBody<6, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    colLoopBody<5, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    colLoopBody<4, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    colLoopBody<3, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    colLoopBody<2, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    colLoopBody<1, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
-    if(cols > col){
-      Index extra_cols= cols-col;
-
-      colLoopBody<1, 4, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
-    }
+    colLoops<4>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB,
+                result + row);
     row += 4;
     indexA += (bigSuffix >> 2);
   }
   //extra rows
-  if(row < rows){
-    Index extra_rows_or_four = rows-row;
-
+  Index extra_rows = rows & 3;
+  if (extra_rows) {
     //This index is the beginning of remaining block.
-    col = 0;
-    colLoopBody<7, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    colLoopBody<6, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    colLoopBody<5, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    colLoopBody<4, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    colLoopBody<3, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    colLoopBody<2, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    colLoopBody<1, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
-    if(cols > col){
-      Index extra_cols= cols-col;
-
-      colLoopBody<1, 8, true, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols, extra_rows_or_four);
-    }
-    row += extra_rows_or_four;
+    colLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexB, strideB,
+                      offsetB, result + row, extra_rows);
   }
 
   //Convert back to bfloat16
@@ -276,9 +341,8 @@
       //get and save block
       PacketBlock<Packet8bf,4> block;
       for(Index j = 0; j < 4; j++){
-        Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(ploadu<Packet4f>(result + (col + j)*rows + row)));
-        Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(ploadu<Packet4f>(result + (col + j)*rows + row + 4)));
-        block.packet[j].m_val = vec_pack(reinterpret_cast<Packet4ui>(fp16_0), reinterpret_cast<Packet4ui>(fp16_1));
+        block.packet[j].m_val =
+            convertF16toF32(result + (col + j) * rows + row);
       }
 
       res2.template storePacketBlock<Packet8bf,4>(row, 0, block);
@@ -295,8 +359,14 @@
   //extra cols
   while(col < cols){
     const LinearMapper res2 = res.getLinearMapper(0, col);
-    for(Index r= 0; r< rows; r++){
-      res2(r) = Eigen::bfloat16(result[col*rows + r]);
+    float* result2 = result + col * rows;
+    Index r = 0;
+    for (; r + 8 <= rows; r += 8) {
+      Packet8bf fp16 = convertF16toF32(result2 + r);
+      res2.template storePacket<Packet8bf>(r, fp16);
+    }
+    for (; r < rows; r++) {
+      res2(r) = Eigen::bfloat16(result2[r]);
     }
     col++;
   }
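
The widening loops added above (vec_mergeh/vec_mergel against a zero vector) and the store path through __builtin_vsx_xvcvspbf16 rely on bfloat16 being the upper 16 bits of an IEEE single-precision float. A scalar sketch of that bit relationship, using truncation for the narrowing direction purely as an illustration (the kernel's store path rounds):

#include <cstdint>
#include <cstring>

// bfloat16 -> float: place the 16 payload bits in the high half of the word.
inline float bf16_to_f32(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// float -> bfloat16 by truncation (illustration only; the kernel rounds).
inline uint16_t f32_to_bf16_trunc(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}
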
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index fa12892..4fb9326 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -171,6 +171,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
     HasExp = 1,
 #ifdef EIGEN_VECTORIZE_VSX
@@ -251,11 +252,16 @@
     size = 4,
     HasHalfPacket = 0,
 
-    HasAdd   = 1,
-    HasSub   = 1,
+    HasAdd = 1,
+    HasSub = 1,
     HasShift = 1,
-    HasMul   = 1,
-    HasDiv   = 0,
+    HasMul = 1,
+#if defined(_ARCH_PWR10) && \
+    (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
+    HasDiv = 1,
+#else
+    HasDiv = 0,
+#endif
     HasBlend = 1,
     HasCmp = 1
   };
@@ -522,7 +528,9 @@
   // some versions of GCC throw "unused-but-set-parameter".
   // ignoring these warnings for now.
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  eigen_internal_assert(
+      n + offset <= packet_size &&
+      "number of elements plus offset will read past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
   EIGEN_UNUSED_VARIABLE(packet_size);
@@ -655,7 +663,9 @@
   // some versions of GCC throw "unused-but-set-parameter" (float *to).
   // ignoring these warnings for now.
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  eigen_internal_assert(
+      n + offset <= packet_size &&
+      "number of elements plus offset will write past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
   EIGEN_UNUSED_VARIABLE(packet_size);
@@ -815,7 +825,8 @@
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
 {
   EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
-  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size &&
+                        "number of elements will gather past end of packet");
   LOAD_STORE_UNROLL_16
   for (Index i = 0; i < n; i++) {
     a[i] = from[i*stride];
@@ -897,7 +908,8 @@
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
 {
   EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
-  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size &&
+                        "number of elements will scatter past end of packet");
   pstore<__UNPACK_TYPE__(Packet)>(a, from);
   LOAD_STORE_UNROLL_16
   for (Index i = 0; i < n; i++) {
@@ -1043,9 +1055,18 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by AltiVec");
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a,
+                                            const Packet4i& b) {
+#if defined(_ARCH_PWR10) && \
+    (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
+  return vec_div(a, b);
+#else
+  EIGEN_UNUSED_VARIABLE(a);
+  EIGEN_UNUSED_VARIABLE(b);
+  eigen_assert(false && "packet integer division are not supported by AltiVec");
   return pset1<Packet4i>(0);
+#endif
 }
 
 // for some weird raisons, it has to be overloaded for packet of integers
@@ -1243,7 +1264,8 @@
 template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
 {
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+  eigen_internal_assert(n <= packet_size &&
+                        "number of elements will read past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
   EIGEN_UNUSED_VARIABLE(packet_size);
@@ -1431,7 +1453,8 @@
 template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from, const Index n)
 {
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+  eigen_internal_assert(n <= packet_size &&
+                        "number of elements will write past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
   EIGEN_UNUSED_VARIABLE(packet_size);
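
With HasDiv set and the vec_div path enabled on Power10 (clang, or GCC 11 and later), coefficient-wise integer division can stay vectorized instead of reaching the asserting fallback. A small usage sketch of the kind of expression this affects, with illustrative values:

#include <Eigen/Core>

// Element-wise integer quotient; on Power10 this may lower to pdiv<Packet4i>,
// i.e. vec_div, rather than the scalar fallback that asserts.
Eigen::Array4i quotient_example() {
  Eigen::Array4i a(8, 9, 10, 11);
  Eigen::Array4i b(2, 3, 5, 7);
  return a / b;  // {4, 3, 2, 1}
}
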
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 38bd93d..9ae5085 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -770,44 +770,49 @@
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
+  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
+
+  const Packet cst_half = pset1<Packet>(0.5f);
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_two = pset1<Packet>(2.0f);
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
   // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
   // even terms only.
-  const Packet p9 = pset1<Packet>(Scalar(5.08838854730129241943359375e-2f));
-  const Packet p7 = pset1<Packet>(Scalar(3.95139865577220916748046875e-2f));
-  const Packet p5 = pset1<Packet>(Scalar(7.550220191478729248046875e-2f));
-  const Packet p3 = pset1<Packet>(Scalar(0.16664917767047882080078125f));
-  const Packet p1 = pset1<Packet>(Scalar(1.00000011920928955078125f));
+  const Packet p9 = pset1<Packet>(5.08838854730129241943359375e-2f);
+  const Packet p7 = pset1<Packet>(3.95139865577220916748046875e-2f);
+  const Packet p5 = pset1<Packet>(7.550220191478729248046875e-2f);
+  const Packet p3 = pset1<Packet>(0.16664917767047882080078125f);
+  const Packet p1 = pset1<Packet>(1.00000011920928955078125f);
 
-  const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
-  Packet x = pabs(x_in);
-  const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
+  const Packet abs_x = pabs(x_in);
+  const Packet sign_mask = pandnot(x_in, abs_x);
+  const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
+
   // For arguments |x| > 0.5, we map x back to [0:0.5] using
   // the transformation x_large = sqrt(0.5*(1-x)), and use the
   // identity
   //   asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
-  const Packet cst_half = pset1<Packet>(Scalar(0.5f));
-  const Packet cst_two = pset1<Packet>(Scalar(2));
-  Packet x_large = psqrt(pnmadd(cst_half, x, cst_half));
-  const Packet large_mask = pcmp_lt(cst_half, x);
-  x = pselect(large_mask, x_large, x);
+
+  const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
+  const Packet large_mask = pcmp_lt(cst_half, abs_x);
+  const Packet x = pselect(large_mask, x_large, abs_x);
+  const Packet x2 = pmul(x, x);
 
   // Compute polynomial.
   // x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
-  Packet x2 = pmul(x, x);
+
   Packet p = pmadd(p9, x2, p7);
   p = pmadd(p, x2, p5);
   p = pmadd(p, x2, p3);
   p = pmadd(p, x2, p1);
   p = pmul(p, x);
 
-  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI/2);
-  Packet p_large = pnmadd(cst_two, p, pset1<Packet>(kPiOverTwo));
+  const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
   p = pselect(large_mask, p_large, p);
   // Flip the sign for negative arguments.
-  p = pselect(neg_mask, pnegate(p), p);
-
+  p = pxor(p, sign_mask);
   // Return NaN for arguments outside [-1:1].
-  return pselect(invalid_mask, pset1<Packet>(std::numeric_limits<float>::quiet_NaN()), p);
+  return por(invalid_mask, p);
 }
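
The rewritten pasin_float keeps the range reduction spelled out in the comments: the polynomial covers |x| <= 0.5, and larger arguments are folded back through asin(x) = pi/2 - 2*asin(sqrt(0.5*(1-x))). A scalar reference for that identity (illustrative only):

#include <cmath>

// Reference for the |x| > 0.5 branch: asin(x) = pi/2 - 2*asin(sqrt(0.5*(1-x))).
inline float asin_large_ref(float x) {  // valid for x in (0.5, 1]
  const float t = std::sqrt(0.5f * (1.0f - x));
  return 1.5707963267948966f - 2.0f * std::asin(t);
}
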
 
 // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
@@ -962,6 +967,35 @@
   return pxor(p, x_signmask);
 }
 
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
+patanh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value,
+                "Scalar type must be float");
+  const Packet half = pset1<Packet>(0.5f);
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  // For |x| in [0:0.5] we use a polynomial approximation of the form
+  // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
+  const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
+  const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
+  const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
+  const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
+  const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
+  const Packet x2 = pmul(x, x);
+  Packet p = pmadd(C11, x2, C9);
+  p = pmadd(x2, p, C7);
+  p = pmadd(x2, p, C5);
+  p = pmadd(x2, p, C3);
+  p = pmadd(pmul(x, x2), p, x);
+
+  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet one = pset1<Packet>(1.0f);
+  Packet r = pdiv(padd(one, x), psub(one, x));
+  r = pmul(half, plog(r));
+  return pselect(x_gt_half, r, p);
+}
+
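
As the comments in patanh_float note, the small-argument branch is an odd polynomial on [0, 0.5] and the |x| >= 0.5 branch uses the closed form atanh(x) = 0.5*ln((1+x)/(1-x)). A scalar reference for that closed form (illustrative only):

#include <cmath>

// Closed form used for the |x| >= 0.5 branch of patanh_float.
inline float atanh_ref(float x) {
  return 0.5f * std::log((1.0f + x) / (1.0f - x));
}
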
 template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pdiv_complex(const Packet& x, const Packet& y) {
@@ -1090,12 +1124,15 @@
   is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
   Packet imag_inf_result;
   imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
+  // unless otherwise specified, if either the real or imaginary component is
+  // nan, the entire result is nan
+  Packet result_is_nan = pandnot(ptrue(result), pcmp_eq(result, result));
+  result = por(result_is_nan, result);
 
-  return  pselect(is_imag_inf, imag_inf_result,
-                  pselect(is_real_inf, real_inf_result,result));
+  return pselect(is_imag_inf, imag_inf_result,
+                 pselect(is_real_inf, real_inf_result, result));
 }
 
-
 template <typename Packet>
 struct psign_impl<
     Packet,
@@ -1105,16 +1142,13 @@
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
     using Scalar = typename unpacket_traits<Packet>::type;
     const Packet cst_one = pset1<Packet>(Scalar(1));
-    const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
     const Packet cst_zero = pzero(a);
 
-    const Packet not_nan_mask = pcmp_eq(a, a);
-    const Packet positive_mask = pcmp_lt(cst_zero, a);
-    const Packet positive = pand(positive_mask, cst_one);
-    const Packet negative_mask = pcmp_lt(a, cst_zero);
-    const Packet negative = pand(negative_mask, cst_minus_one);
+    const Packet abs_a = pabs(a);
+    const Packet sign_mask = pandnot(a, abs_a);
+    const Packet nonzero_mask = pcmp_lt(cst_zero, abs_a);
 
-    return pselect(not_nan_mask, por(positive, negative), a);
+    return pselect(nonzero_mask, por(sign_mask, cst_one), abs_a);
   }
 };
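
The simplified psign_impl extracts the sign bit with pandnot and ORs it back onto 1, so a single compare against zero replaces the separate positive/negative/NaN masks. A scalar analogue of the resulting behaviour, as a sketch rather than the packet code:

#include <cmath>

// sign(a) = +/-1 for nonzero a, 0 for +/-0, and NaN propagates as |a|.
inline float sign_ref(float a) {
  const float abs_a = std::fabs(a);               // clears the sign bit, keeps NaN
  return (abs_a > 0.0f) ? std::copysign(1.0f, a)  // 1 with a's sign
                        : abs_a;                  // 0 for zero, NaN for NaN
}
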
 
@@ -1221,7 +1255,7 @@
 void twoprod(const Packet& x, const Packet& y,
              Packet& p_hi, Packet& p_lo) {
   p_hi = pmul(x, y);
-  p_lo = pmadd(x, y, pnegate(p_hi));
+  p_lo = pmsub(x, y, p_hi);
 }
 
 #else
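
When FMA is available, the updated twoprod obtains the rounding error of x*y from a single pmsub instead of pmadd plus an explicit negation; the pair (p_hi, p_lo) still represents the product exactly. A scalar sketch, assuming std::fma maps to a true hardware FMA:

#include <cmath>

// Error-free product: p_hi + p_lo == x * y exactly when fma is a real FMA
// and no overflow/underflow occurs.
inline void two_prod_ref(float x, float y, float& p_hi, float& p_lo) {
  p_hi = x * y;
  p_lo = std::fma(x, y, -p_hi);  // the packet code spells this pmsub(x, y, p_hi)
}
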
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 179c55c..650c441 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -109,6 +109,11 @@
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet patan_double(const Packet& x);
 
+/** \internal \returns atanh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
+patanh_float(const Packet& x);
+
 /** \internal \returns sqrt(x) for complex types */
 template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 6ff3e20..c8ca33a 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -779,6 +779,12 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
   return half(::acosf(float(a)));
 }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) {
+  return half(::atanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) {
+  return half(::atanhf(float(a)));
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index e2bcf48..17dd8fb 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -379,7 +379,7 @@
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -387,7 +387,7 @@
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
 #endif
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 3b1c6e7..e436360 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -132,12 +132,12 @@
 
 template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
 {
-  const Packet2ui b = vreinterpret_u32_f32(a.v);
+  const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v));
   return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
-  const Packet4ui b = vreinterpretq_u32_f32(a.v);
+  const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v));
   return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index aea5149..7be60e0 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -55,6 +55,17 @@
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& x)
 { return internal::generic_fast_tanh_float(x); }
 
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f
+patanh<Packet2f>(const Packet2f& x) {
+  return patanh_float(x);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
+patanh<Packet4f>(const Packet4f& x) {
+  return patanh_float(x);
+}
+
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
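
With these specializations (and HasATanh enabled in the float packet traits further down), array-wise atanh on NEON takes the vectorized path instead of falling back to scalar std::atanh. A minimal usage sketch:

  #include <Eigen/Core>

  Eigen::ArrayXf atanh_demo(const Eigen::ArrayXf& x) {
    return x.atanh();  // dispatches to patanh<Packet4f>/<Packet2f> when vectorization is enabled
  }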
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 6d4d1c3..745a023 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -59,6 +59,16 @@
 typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
 typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
 
+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
+  float from[4] = {a, b, c, d};
+  return vld1q_f32(from);
+}
+
+EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
+  float from[2] = {a, b};
+  return vld1_f32(from);
+}
+
 #else
 
 typedef float32x2_t                          Packet2f;
@@ -80,11 +90,20 @@
 typedef int64x2_t                            Packet2l;
 typedef uint64x2_t                           Packet2ul;
 
+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
+  return Packet4f{a, b, c, d};
+}
+EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
+  return Packet2f{a, b};
+}
+
 #endif // EIGEN_COMP_MSVC_STRICT
 
 EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
   const float* a = reinterpret_cast<const float*>(&m);
-  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)),
+                    *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
   return res;
 }
 
@@ -97,7 +116,9 @@
 {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
-  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)),
+                    *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
   return res;
 }
 
@@ -106,7 +127,9 @@
 {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
-  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)),
+                    *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
   return res;
 }
 
@@ -148,11 +171,12 @@
 #define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#if EIGEN_ARCH_ARM64
-  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
-  // prefetch instructions there are too detailed for __builtin_prefetch to map
-  // meaningfully to them.
-  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
+// __builtin_prefetch tends to do nothing on ARM64 compilers because the
+// prefetch instructions there are too detailed for __builtin_prefetch to map
+// meaningfully to them.
+#define EIGEN_ARM_PREFETCH(ADDR)                                                                     \
+  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
 #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
   #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
@@ -169,45 +193,45 @@
 {
   typedef Packet4f type;
   typedef Packet2f half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
     HasHalfPacket = 1,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
+    HasBlend = 0,
 
-    HasDiv   = 1,
+    HasDiv = 1,
     HasFloor = 1,
     HasCeil = 1,
     HasRint = 1,
 
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasACos  = 1,
-    HasASin  = 1,
-    HasATan  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf  = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasBessel = 0,  // Issues with accuracy.
     HasNdtri = 0
   };
@@ -865,12 +889,13 @@
 
 template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
 template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
-  Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
+  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
 template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
+  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f,
+                                numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
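
paddsub adds and subtracts in alternating lanes; the mask flips the sign bit of b in even lanes so a single padd does the work. In scalar terms (a hedged sketch of the lane convention used here):

  // paddsub({a0,a1,a2,a3}, {b0,b1,b2,b3}) == {a0-b0, a1+b1, a2-b2, a3+b3}
  void paddsub_model(const float a[4], const float b[4], float r[4]) {
    for (int i = 0; i < 4; ++i)
      r[i] = (i % 2 == 0) ? a[i] - b[i] : a[i] + b[i];
  }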
 
@@ -2535,8 +2560,10 @@
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
 { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
+}
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
 {
   int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
@@ -2549,8 +2576,10 @@
   prod = vmul_s8(prod, vrev32_s8(prod));
   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
+  return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
+}
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
 {
   uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
@@ -2563,8 +2592,10 @@
   prod = vmul_u8(prod, vrev32_u8(prod));
   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
 }
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
+  return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
+}
 template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
 {
   const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
@@ -2599,12 +2630,16 @@
 }
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
 { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
+}
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
 { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
+  return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
+}
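
These reductions now spell out the half-width packet type explicitly (e.g. predux_mul<Packet2f>) because, with the MSVC wrapper types, the raw intrinsic result no longer deduces to the wrapped packet and an unqualified recursive call can fail to resolve. The arithmetic itself is unchanged, a split-and-multiply reduction:

  // predux_mul over 4 lanes: multiply low and high halves lane-wise, then reduce the pair.
  float predux_mul_model(const float a[4]) {
    float lo_hi[2] = {a[0] * a[2], a[1] * a[3]};  // vmul_f32(vget_low, vget_high)
    return lo_hi[0] * lo_hi[1];                   // 2-lane predux_mul
  }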
 template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
 { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
 template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
@@ -3457,7 +3492,7 @@
 {
   // See the scalar implementation in BFloat16.h for a comprehensible explanation
   // of this fast rounding algorithm
-  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
 
   // lsb = (input >> 16) & 1
   Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
@@ -3482,7 +3517,7 @@
 
 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
 {
-  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
 }
 
 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
@@ -3490,21 +3525,22 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
-  return pset1<Packet4us>(from.value);
+  return Packet4bf(pset1<Packet4us>(from.value));
 }
 
 template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
-  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+  return bfloat16_impl::raw_uint16_to_bfloat16(
+      static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
 {
-  return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+  return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
 {
-  return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+  return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
@@ -3519,7 +3555,8 @@
 
 template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
 {
-  return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+  return Packet4bf(
+      ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
 template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
@@ -3566,25 +3603,26 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
-  return por<Packet4us>(a, b);
+  return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
-  return pxor<Packet4us>(a, b);
+  return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
-  return pand<Packet4us>(a, b);
+  return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
-  return pandnot<Packet4us>(a, b);
+  return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
                                                       const Packet4bf& b)
 {
-  return pselect<Packet4us>(mask, a, b);
+  return Packet4bf(
+      pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
@@ -3623,13 +3661,15 @@
 template<>
 EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
 {
-  return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
+  return Packet4bf(pgather<uint16_t, Packet4us>(
+      reinterpret_cast<const uint16_t*>(from), stride));
 }
 
 template<>
 EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
 {
-  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
+  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to),
+                                Packet4us(from), stride);
 }
 
 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
@@ -3654,7 +3694,7 @@
 
 template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
 {
-  return preverse<Packet4us>(a);
+  return Packet4bf(preverse<Packet4us>(Packet4us(a)));
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
@@ -3689,7 +3729,8 @@
 
 template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
 {
-  return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
+  return Packet4bf(pxor<Packet4us>(
+      Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
 }
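
pnegate for bfloat16 just toggles the sign bit, which is why it is an XOR with 0x8000 on the underlying uint16 lanes; the change only adds the explicit Packet4us/Packet4bf conversions needed once the wrapper types stop converting implicitly. Scalar sketch of the bit trick:

  #include <cstdint>

  // Negating a bfloat16 by flipping its sign bit (bfloat16 is the top 16 bits of a float).
  uint16_t bf16_negate_bits(uint16_t raw) {
    return static_cast<uint16_t>(raw ^ 0x8000u);
  }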
 
 //---------- double ----------
@@ -3707,17 +3748,35 @@
 
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
-// Bug 907: workaround missing declarations of the following two functions in the ADK
-// Defining these functions as templates ensures that if these intrinsics are
-// already defined in arm_neon.h, then our workaround doesn't cause a conflict
-// and has lower priority in overload resolution.
+#if EIGEN_COMP_GNUC
+// Bug 907: workaround missing declarations of the following two functions in
+// the ADK. Defining these functions as templates ensures that if these
+// intrinsics are already defined in arm_neon.h, then our workaround doesn't
+// cause a conflict and has lower priority in overload resolution. This doesn't
+// work with MSVC though, since the function names are macros.
 template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
 
 template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+#endif
 
+#if EIGEN_COMP_MSVC_STRICT
+typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
+typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  double from[2] = {a, b};
+  return vld1q_f64(from);
+}
+
+#else
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
 
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  return Packet2d{a, b};
+}
+#endif
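
Under MSVC's strict mode the NEON vector types are not compiler-builtin vectors, so brace-initialization such as Packet2d{a, b} does not compile and several NEON types collapse onto the same underlying struct; wrapping them in eigen_packet_wrapper gives each packet a distinct C++ type while make_packet2d builds values through a small array and vld1q_f64. A simplified stand-in (an assumption for illustration, not Eigen's actual eigen_packet_wrapper definition):

  #include <arm_neon.h>

  // Thin tagged wrapper: the integer tag keeps otherwise-identical MSVC NEON
  // types distinct for template specialization purposes.
  template <typename T, int UniqueTag>
  struct packet_wrapper_model {
    T m_val;
    packet_wrapper_model() = default;
    packet_wrapper_model(const T& v) : m_val(v) {}
    operator T&() { return m_val; }
    operator const T&() const { return m_val; }
  };
  using Packet2d_model = packet_wrapper_model<float64x2_t, 18>;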
+
 // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
 // for fast inversion of matrices of size 4.
@@ -3725,7 +3784,7 @@
 {
   const double* a = reinterpret_cast<const double*>(&m);
   const double* b = reinterpret_cast<const double*>(&n);
-  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
   return res;
 }
 
@@ -3819,7 +3878,8 @@
 
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
 template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
-  const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
+  const Packet2d mask =
+      make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
   return padd(a, pxor(mask, b));
 }
 
@@ -3932,8 +3992,10 @@
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
+}
 #endif
 
 // min
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index e5ddab6..0ec6489 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -18,6 +18,137 @@
 namespace internal {
 
 //==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet2f
+preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
+  return Packet2f(vreinterpret_f32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f
+preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
+  return Packet2f(vreinterpret_f32_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f
+preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return Packet4f(vreinterpretq_f32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f
+preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return Packet4f(vreinterpretq_f32_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c
+preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c
+preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
+  return Packet8c(vreinterpret_s8_u8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c
+preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
+  return Packet16c(vreinterpretq_s8_u8(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4uc
+preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc
+preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
+  return Packet8uc(vreinterpret_u8_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc
+preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return Packet16uc(vreinterpretq_u8_s8(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4s
+preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
+  return Packet4s(vreinterpret_s16_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s
+preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return Packet8s(vreinterpretq_s16_u16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4us
+preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
+  return Packet4us(vreinterpret_u16_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us
+preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return Packet8us(vreinterpretq_u16_s16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2i
+preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
+  return Packet2i(vreinterpret_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i
+preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
+  return Packet2i(vreinterpret_s32_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i
+preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return Packet4i(vreinterpretq_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i
+preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return Packet4i(vreinterpretq_s32_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2ui
+preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
+  return Packet2ui(vreinterpret_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui
+preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
+  return Packet2ui(vreinterpret_u32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui
+preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return Packet4ui(vreinterpretq_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui
+preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return Packet4ui(vreinterpretq_u32_s32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l
+preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
+  return Packet2l(vreinterpretq_s64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul
+preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return Packet2ul(vreinterpretq_u64_s64(a));
+}
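
preinterpret is a pure bit reinterpretation, while pcast converts values; moving the preinterpret block ahead of the pcast definitions lets the casts below reuse it. The distinction in scalar terms (illustrative sketch):

  #include <cstdint>
  #include <cstring>

  void reinterpret_vs_cast() {
    int32_t i = 1;
    float as_bits;                             // preinterpret-style: copy the bit pattern
    std::memcpy(&as_bits, &i, sizeof(float));  // ~1.4e-45f (a denormal), not 1.0f
    float as_value = static_cast<float>(i);    // pcast<int,float>-style value conversion: 1.0f
    (void)as_bits; (void)as_value;
  }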
+
+//==============================================================================
 // pcast, SrcType = float
 //==============================================================================
 template <>
@@ -190,7 +321,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
-  return vreinterpretq_u64_s64(pcast<Packet16c, Packet2l>(a));
+  return preinterpret<Packet2ul>(pcast<Packet16c, Packet2l>(a));
 }
 
 template <>
@@ -214,11 +345,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
-  return vreinterpretq_u32_s32(pcast<Packet16c, Packet4i>(a));
+  return preinterpret<Packet4ui>(pcast<Packet16c, Packet4i>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
-  return vreinterpret_u32_s32(pcast<Packet8c, Packet2i>(a));
+  return preinterpret<Packet2ui>(pcast<Packet8c, Packet2i>(a));
 }
 
 template <>
@@ -242,11 +373,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
-  return vreinterpretq_u16_s16(pcast<Packet16c, Packet8s>(a));
+  return preinterpret<Packet8us>(pcast<Packet16c, Packet8s>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
-  return vreinterpret_u16_s16(pcast<Packet8c, Packet4s>(a));
+  return preinterpret<Packet4us>(pcast<Packet8c, Packet4s>(a));
 }
 
 template <>
@@ -272,11 +403,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c, Packet16uc>(const Packet16c& a) {
-  return vreinterpretq_u8_s8(a);
+  return preinterpret<Packet16uc>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c, Packet8uc>(const Packet8c& a) {
-  return vreinterpret_u8_s8(a);
+  return preinterpret<Packet8uc>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c, Packet4uc>(const Packet4c& a) {
@@ -317,7 +448,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
-  return vreinterpretq_s64_u64(pcast<Packet16uc, Packet2ul>(a));
+  return preinterpret<Packet2l>(pcast<Packet16uc, Packet2ul>(a));
 }
 
 template <>
@@ -341,11 +472,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
-  return vreinterpretq_s32_u32(pcast<Packet16uc, Packet4ui>(a));
+  return preinterpret<Packet4i>(pcast<Packet16uc, Packet4ui>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
-  return vreinterpret_s32_u32(pcast<Packet8uc, Packet2ui>(a));
+  return preinterpret<Packet2i>(pcast<Packet8uc, Packet2ui>(a));
 }
 
 template <>
@@ -369,11 +500,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
-  return vreinterpretq_s16_u16(pcast<Packet16uc, Packet8us>(a));
+  return preinterpret<Packet8s>(pcast<Packet16uc, Packet8us>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pcast<Packet8uc, Packet4s>(const Packet8uc& a) {
-  return vreinterpret_s16_u16(pcast<Packet8uc, Packet4us>(a));
+  return preinterpret<Packet4s>(pcast<Packet8uc, Packet4us>(a));
 }
 
 template <>
@@ -399,11 +530,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc, Packet16c>(const Packet16uc& a) {
-  return vreinterpretq_s8_u8(a);
+  return preinterpret<Packet16c>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc, Packet8c>(const Packet8uc& a) {
-  return vreinterpret_s8_u8(a);
+  return preinterpret<Packet8c>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc, Packet4c>(const Packet4uc& a) {
@@ -444,7 +575,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
-  return vreinterpretq_u64_s64(pcast<Packet8s, Packet2l>(a));
+  return preinterpret<Packet2ul>(pcast<Packet8s, Packet2l>(a));
 }
 
 template <>
@@ -468,11 +599,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
-  return vreinterpretq_u32_s32(pcast<Packet8s, Packet4i>(a));
+  return preinterpret<Packet4ui>(pcast<Packet8s, Packet4i>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
-  return vreinterpret_u32_s32(pcast<Packet4s, Packet2i>(a));
+  return preinterpret<Packet2ui>(pcast<Packet4s, Packet2i>(a));
 }
 
 template <>
@@ -494,11 +625,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet8s, Packet8us>(const Packet8s& a) {
-  return vreinterpretq_u16_s16(a);
+  return preinterpret<Packet8us>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4us pcast<Packet4s, Packet4us>(const Packet4s& a) {
-  return vreinterpret_u16_s16(a);
+  return preinterpret<Packet4us>(a);
 }
 
 template <>
@@ -561,7 +692,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
-  return vreinterpretq_s64_u64(pcast<Packet8us, Packet2ul>(a));
+  return preinterpret<Packet2l>(pcast<Packet8us, Packet2ul>(a));
 }
 
 template <>
@@ -585,11 +716,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
-  return vreinterpretq_s32_u32(pcast<Packet8us, Packet4ui>(a));
+  return preinterpret<Packet4i>(pcast<Packet8us, Packet4ui>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
-  return vreinterpret_s32_u32(pcast<Packet4us, Packet2ui>(a));
+  return preinterpret<Packet2i>(pcast<Packet4us, Packet2ui>(a));
 }
 
 template <>
@@ -611,11 +742,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet8us, Packet8s>(const Packet8us& a) {
-  return vreinterpretq_s16_u16(a);
+  return preinterpret<Packet8s>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pcast<Packet4us, Packet4s>(const Packet4us& a) {
-  return vreinterpret_s16_u16(a);
+  return preinterpret<Packet4s>(a);
 }
 
 template <>
@@ -637,11 +768,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
-  return vreinterpretq_s8_u8(pcast<Packet8us, Packet16uc>(a, b));
+  return preinterpret<Packet16c>(pcast<Packet8us, Packet16uc>(a, b));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
-  return vreinterpret_s8_u8(pcast<Packet4us, Packet8uc>(a, b));
+  return preinterpret<Packet8c>(pcast<Packet4us, Packet8uc>(a, b));
 }
 
 //==============================================================================
@@ -676,7 +807,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
-  return vreinterpretq_u64_s64(pcast<Packet4i, Packet2l>(a));
+  return preinterpret<Packet2ul>(pcast<Packet4i, Packet2l>(a));
 }
 
 template <>
@@ -698,11 +829,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i, Packet4ui>(const Packet4i& a) {
-  return vreinterpretq_u32_s32(a);
+  return preinterpret<Packet4ui>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i, Packet2ui>(const Packet2i& a) {
-  return vreinterpret_u32_s32(a);
+  return preinterpret<Packet2ui>(a);
 }
 
 template <>
@@ -801,7 +932,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
-  return vreinterpretq_s64_u64(pcast<Packet4ui, Packet2ul>(a));
+  return preinterpret<Packet2l>(pcast<Packet4ui, Packet2ul>(a));
 }
 
 template <>
@@ -823,11 +954,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui, Packet4i>(const Packet4ui& a) {
-  return vreinterpretq_s32_u32(a);
+  return preinterpret<Packet4i>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui, Packet2i>(const Packet2ui& a) {
-  return vreinterpret_s32_u32(a);
+  return preinterpret<Packet2i>(a);
 }
 
 template <>
@@ -849,11 +980,11 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
-  return vreinterpretq_s16_u16(pcast<Packet4ui, Packet8us>(a, b));
+  return preinterpret<Packet8s>(pcast<Packet4ui, Packet8us>(a, b));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
-  return vreinterpret_s16_u16(pcast<Packet2ui, Packet4us>(a, b));
+  return preinterpret<Packet4s>(pcast<Packet2ui, Packet4us>(a, b));
 }
 
 template <>
@@ -882,12 +1013,12 @@
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
                                                           const Packet4ui& d) {
-  return vreinterpretq_s8_u8(pcast<Packet4ui, Packet16uc>(a, b, c, d));
+  return preinterpret<Packet16c>(pcast<Packet4ui, Packet16uc>(a, b, c, d));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
                                                         const Packet2ui& d) {
-  return vreinterpret_s8_u8(pcast<Packet2ui, Packet8uc>(a, b, c, d));
+  return preinterpret<Packet8c>(pcast<Packet2ui, Packet8uc>(a, b, c, d));
 }
 
 //==============================================================================
@@ -917,7 +1048,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l, Packet2ul>(const Packet2l& a) {
-  return vreinterpretq_u64_s64(a);
+  return preinterpret<Packet2ul>(a);
 }
 
 template <>
@@ -1015,7 +1146,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul, Packet2l>(const Packet2ul& a) {
-  return vreinterpretq_s64_u64(a);
+  return preinterpret<Packet2l>(a);
 }
 
 template <>
@@ -1033,7 +1164,7 @@
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
-  return vreinterpretq_s32_u32(pcast<Packet2ul, Packet4ui>(a, b));
+  return preinterpret<Packet4i>(pcast<Packet2ul, Packet4ui>(a, b));
 }
 
 template <>
@@ -1055,7 +1186,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                         const Packet2ul& d) {
-  return vreinterpretq_s16_u16(pcast<Packet2ul, Packet8us>(a, b, c, d));
+  return preinterpret<Packet8s>(pcast<Packet2ul, Packet8us>(a, b, c, d));
 }
 
 template <>
@@ -1079,114 +1210,8 @@
 EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                           const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
                                                           const Packet2ul& g, const Packet2ul& h) {
-  return vreinterpretq_s8_u8(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
-}
-
-//==============================================================================
-// preinterpret
-//==============================================================================
-template <>
-EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
-  return vreinterpret_f32_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
-  return vreinterpret_f32_u32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_f32_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
-  return vreinterpretq_f32_u32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
-  return static_cast<Packet4c>(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
-  return vreinterpret_s8_u8(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
-  return vreinterpretq_s8_u8(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
-  return static_cast<Packet4uc>(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
-  return vreinterpret_u8_s8(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
-  return vreinterpretq_u8_s8(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
-  return vreinterpret_s16_u16(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
-  return vreinterpretq_s16_u16(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
-  return vreinterpret_u16_s16(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
-  return vreinterpretq_u16_s16(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
-  return vreinterpret_s32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
-  return vreinterpret_s32_u32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
-  return vreinterpretq_s32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
-  return vreinterpretq_s32_u32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
-  return vreinterpret_u32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
-  return vreinterpret_u32_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
-  return vreinterpretq_u32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_u32_s32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
-  return vreinterpretq_s64_u64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
-  return vreinterpretq_u64_s64(a);
+  return preinterpret<Packet16c>(
+      pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
 }
 
 #if EIGEN_ARCH_ARM64
@@ -1196,6 +1221,37 @@
 //==============================================================================
 
 template <>
+EIGEN_STRONG_INLINE Packet2d
+preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return Packet2d(vreinterpretq_f64_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d
+preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return Packet2d(vreinterpretq_f64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l
+preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return Packet2l(vreinterpretq_s64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul
+preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return Packet2ul(vreinterpretq_u64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d
+preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return Packet2d(vreinterpretq_f64_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i
+preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return Packet4i(vreinterpretq_s32_f64(a));
+}
+
+template <>
 struct type_casting_traits<double, double> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
@@ -1316,7 +1372,10 @@
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet8c, Packet2f>(vget_low_s8(a)));
+  // MSVC defines most intrinsics as macros, so we need to do this in two lines
+  // for portability.
+  Packet2f tmp = pcast<Packet8c, Packet2f>(vget_low_s8(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1326,7 +1385,8 @@
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet8uc, Packet2f>(vget_low_u8(a)));
+  Packet2f tmp = pcast<Packet8uc, Packet2f>(vget_low_u8(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1336,7 +1396,8 @@
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet4s, Packet2f>(vget_low_s16(a)));
+  Packet2f tmp = pcast<Packet4s, Packet2f>(vget_low_s16(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1346,7 +1407,8 @@
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet4us, Packet2f>(vget_low_u16(a)));
+  Packet2f tmp = pcast<Packet4us, Packet2f>(vget_low_u16(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1387,31 +1449,6 @@
   return vcvtq_f64_u64(a);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
-  return vreinterpretq_f64_s64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
-  return vreinterpretq_f64_u64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_s64_f64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_u64_f64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_f64_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_s32_f64(a);
-}
-
 #endif  // EIGEN_ARCH_ARM64
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index f98fb7a..17ea6c9 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -81,11 +81,6 @@
   return pacos_float(_x);
 }
 
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d patan<Packet2d>(const Packet2d& _x) {
-  return patan_double(_x);
-}
-
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f pasin<Packet4f>(const Packet4f& _x)
 {
@@ -98,6 +93,18 @@
   return patan_float(_x);
 }
 
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d
+patan<Packet2d>(const Packet2d& _x) {
+  return patan_double(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
+patanh<Packet4f>(const Packet4f& _x) {
+  return patanh_float(_x);
+}
+
 // Notice that for newer processors, it is counterproductive to use Newton
 // iteration for square root. In particular, Skylake and Zen2 processors
 // have approximately doubled throughput of the _mm_sqrt_ps instruction
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 434d033..d1351b9 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -134,7 +134,7 @@
     size = 4,
     HasHalfPacket = 0,
 
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
@@ -142,6 +142,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
@@ -159,7 +160,7 @@
     HasRound = 1,
 #endif
     HasRint = 1,
-    HasSign = 0   // The manually vectorized version is slightly slower for SSE.
+    HasSign = 0  // The manually vectorized version is slightly slower for SSE.
   };
 };
 template <>
@@ -212,20 +213,21 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     HasHalfPacket = 0,
-    size=16,
-    
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 0,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 0,
-    HasAbs2      = 0,
-    HasMin       = 0,
-    HasMax       = 0,
-    HasConj      = 0,
-    HasSqrt      = 1,
-    HasSign      = 0   // Don't try to vectorize psign<bool> = identity.
+    size = 16,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasCmp = 1,  // note -- only pcmp_eq is defined
+    HasShift = 0,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
+    HasSqrt = 1,
+    HasSign = 0  // Don't try to vectorize psign<bool> = identity.
   };
 };
 
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 3c76019..3759e1a 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -428,63 +428,202 @@
   };
 };
 
-
-
 /** \internal
-  * \brief Template functor to compute the and of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator&&
-  */
+ * \brief Template functor to compute the and of two scalars as if they were
+ * booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator&&
+ */
+template <typename Scalar>
 struct scalar_boolean_and_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pand(a,b); }
+  using result_type = Scalar;
+  // `false` is any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return (a != Scalar(0)) && (b != Scalar(0)) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a,
+                                                        const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    // and(a,b) == !or(!a,!b)
+    Packet not_a = pcmp_eq(a, pzero(a));
+    Packet not_b = pcmp_eq(b, pzero(b));
+    Packet a_nand_b = por(not_a, not_b);
+    return pandnot(cst_one, a_nand_b);
+  }
 };
-template<> struct functor_traits<scalar_boolean_and_op> {
+template <typename Scalar>
+struct functor_traits<scalar_boolean_and_op<Scalar>> {
   enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = true
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasCmp
   };
 };
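
With the functor templated on Scalar, the logical AND of two non-bool scalars yields Scalar(0) or Scalar(1); any value comparing equal to Scalar(0) counts as false, and the packet path builds the same result from pcmp_eq/por/pandnot. A direct use of the functor as defined above (illustrative only):

  #include <Eigen/Core>

  void typed_and_demo() {
    Eigen::internal::scalar_boolean_and_op<float> land;
    float t = land(2.5f, -1.0f);  // both nonzero -> 1.0f
    float f = land(2.5f,  0.0f);  // one operand is zero -> 0.0f
    (void)t; (void)f;
  }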
 
 /** \internal
-  * \brief Template functor to compute the or of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator||
-  */
+ * \brief Template functor to compute the or of two scalars as if they were
+ * booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator||
+ */
+template <typename Scalar>
 struct scalar_boolean_or_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::por(a,b); }
+  using result_type = Scalar;
+  // `false` is any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return (a != Scalar(0)) || (b != Scalar(0)) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    // if or(a,b) == 0, then a == 0 and b == 0
+    // or(a,b) == !nor(a,b)
+    Packet a_nor_b = pcmp_eq(por(a, b), pzero(a));
+    return pandnot(cst_one, a_nor_b);
+  }
 };
-template<> struct functor_traits<scalar_boolean_or_op> {
+template <typename Scalar>
+struct functor_traits<scalar_boolean_or_op<Scalar>> {
   enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = true
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasCmp
   };
 };
 
 /** \internal
- * \brief Template functor to compute the xor of two booleans
+ * \brief Template functor to compute the xor of two scalars as if they were
+ * booleans
  *
  * \sa class CwiseBinaryOp, ArrayBase::operator^
  */
+template <typename Scalar>
 struct scalar_boolean_xor_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pxor(a,b); }
+  using result_type = Scalar;
+  // `false` is any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return (a != Scalar(0)) != (b != Scalar(0)) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    // xor(a,b) == xor(!a,!b)
+    Packet not_a = pcmp_eq(a, pzero(a));
+    Packet not_b = pcmp_eq(b, pzero(b));
+    Packet a_xor_b = pxor(not_a, not_b);
+    return pand(cst_one, a_xor_b);
+  }
 };
-template<> struct functor_traits<scalar_boolean_xor_op> {
+template <typename Scalar>
+struct functor_traits<scalar_boolean_xor_op<Scalar>> {
   enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = true
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasCmp
   };
 };
 
 /** \internal
+ * \brief Template functor to compute the bitwise and of two scalars
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator&
+ */
+template <typename Scalar>
+struct scalar_bitwise_and_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA
+                          TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    Scalar result;
+    const uint8_t* a_bytes = reinterpret_cast<const uint8_t*>(&a);
+    const uint8_t* b_bytes = reinterpret_cast<const uint8_t*>(&b);
+    uint8_t* r_bytes = reinterpret_cast<uint8_t*>(&result);
+    for (Index i = 0; i < sizeof(Scalar); i++)
+      r_bytes[i] = a_bytes[i] & b_bytes[i];
+    return result;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a,
+                                                        const Packet& b) const {
+    return pand(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_and_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
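
The bitwise functors operate on the object representation byte by byte, which is only meaningful for plain data types (hence the static assert). For example, AND-ing a float against the 0x7FFFFFFF pattern clears its sign bit (illustrative sketch using the functor defined above):

  #include <Eigen/Core>
  #include <cstdint>
  #include <cstring>

  void bitwise_and_demo() {
    float x = -3.5f, mask;
    uint32_t m = 0x7fffffffu;
    std::memcpy(&mask, &m, sizeof(float));
    Eigen::internal::scalar_bitwise_and_op<float> band;
    float y = band(x, mask);  // 3.5f: sign bit masked off
    (void)y;
  }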
+
+/** \internal
+ * \brief Template functor to compute the bitwise or of two scalars
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator|
+ */
+template <typename Scalar>
+struct scalar_bitwise_or_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA
+                          TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    Scalar result;
+    const uint8_t* a_bytes = reinterpret_cast<const uint8_t*>(&a);
+    const uint8_t* b_bytes = reinterpret_cast<const uint8_t*>(&b);
+    uint8_t* r_bytes = reinterpret_cast<uint8_t*>(&result);
+    for (Index i = 0; i < sizeof(Scalar); i++)
+      r_bytes[i] = a_bytes[i] | b_bytes[i];
+    return result;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a,
+                                                        const Packet& b) const {
+    return por(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_or_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
+ * \brief Template functor to compute the bitwise xor of two scalars
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
+template <typename Scalar>
+struct scalar_bitwise_xor_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA
+                          TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    Scalar result;
+    const uint8_t* a_bytes = reinterpret_cast<const uint8_t*>(&a);
+    const uint8_t* b_bytes = reinterpret_cast<const uint8_t*>(&b);
+    uint8_t* r_bytes = reinterpret_cast<uint8_t*>(&result);
+    for (Index i = 0; i < sizeof(Scalar); i++)
+      r_bytes[i] = a_bytes[i] ^ b_bytes[i];
+    return result;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a,
+                                                        const Packet& b) const {
+    return pxor(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_xor_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
   * \brief Template functor to compute the absolute difference of two scalars
   *
   * \sa class CwiseBinaryOp, MatrixBase::absolute_difference
@@ -533,8 +672,12 @@
     struct functor_traits<scalar_atan2_op<LhsScalar, RhsScalar>> {
   using Scalar = LhsScalar;
   enum {
-    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<Scalar>::HasATan && packet_traits<Scalar>::HasDiv && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex,
-    Cost = scalar_div_cost<Scalar, PacketAccess>::value + functor_traits<scalar_atan_op<Scalar>>::Cost
+    PacketAccess =
+        is_same<LhsScalar, RhsScalar>::value &&
+        packet_traits<Scalar>::HasATan && packet_traits<Scalar>::HasDiv &&
+        !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex,
+    Cost = int(scalar_div_cost<Scalar, PacketAccess>::value) +
+           int(functor_traits<scalar_atan_op<Scalar>>::Cost)
   };
 };
 
diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h
index 5971075..b8b842b 100644
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -13,29 +13,6 @@
 #include "../InternalHeaderCheck.h"
 
 namespace Eigen {
-
-// Portable replacements for certain functors.
-namespace numext {
-
-template<typename T = void>
-struct equal_to {
-  typedef bool result_type;
-  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
-    return lhs == rhs;
-  }
-};
-
-template<typename T = void>
-struct not_equal_to {
-  typedef bool result_type;
-  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
-    return lhs != rhs;
-  }
-};
-
-}
-
-
 namespace internal {
 
 // default functor traits for STL functors:
@@ -93,17 +70,9 @@
 { enum { Cost = 1, PacketAccess = false }; };
 
 template<typename T>
-struct functor_traits<numext::equal_to<T> >
-  : functor_traits<std::equal_to<T> > {};
-
-template<typename T>
 struct functor_traits<std::not_equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };
 
-template<typename T>
-struct functor_traits<numext::not_equal_to<T> >
-  : functor_traits<std::not_equal_to<T> > {};
-
 #if (EIGEN_COMP_CXXVER < 17)
 // std::unary_negate is deprecated since c++17 and will be removed in c++20
 template<typename T>
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 3485369..fca0df0 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -146,6 +146,35 @@
     PacketAccess = packet_traits<Scalar>::HasArg
   };
 };
+
+/** \internal
+ * \brief Template functor to compute the complex argument, returned as a
+ * complex type
+ *
+ * \sa class CwiseUnaryOp, Cwise::carg
+ */
+template <typename Scalar>
+struct scalar_carg_op {
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return Scalar(numext::arg(a));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& a) const {
+    return pcarg(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_carg_op<Scalar>> {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  enum {
+    Cost = functor_traits<scalar_atan2_op<RealScalar>>::Cost,
+    PacketAccess = packet_traits<RealScalar>::HasATan
+  };
+};
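
scalar_carg_op returns the argument (phase angle) re-wrapped in the input scalar type, so for complex inputs the angle lands in the real part and expressions keep a uniform scalar type. Roughly (illustrative, using the functor as declared above):

  #include <complex>
  #include <Eigen/Core>

  void carg_demo() {
    std::complex<float> z(1.0f, 1.0f);
    Eigen::internal::scalar_carg_op<std::complex<float>> carg;
    std::complex<float> a = carg(z);  // about (0.7853982f, 0): pi/4 stored in the real part
    (void)a;
  }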
+
 /** \internal
   * \brief Template functor to cast a scalar to another type
   *
@@ -600,11 +629,18 @@
 template <typename Scalar>
 struct scalar_atanh_op {
   EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+    return patanh(x);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<scalar_atanh_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATanh
+  };
 };
 
 /** \internal
@@ -891,22 +927,65 @@
 };
 
 /** \internal
-  * \brief Template functor to compute the logical not of a boolean
-  *
-  * \sa class CwiseUnaryOp, ArrayBase::operator!
-  */
-template<typename Scalar> struct scalar_boolean_not_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
+ * \brief Template functor to compute the logical not of a scalar as if it were
+ * a boolean
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::operator!
+ */
+template <typename Scalar>
+struct scalar_boolean_not_op {
+  using result_type = Scalar;
+  // `false` is any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a) const {
+    return a == Scalar(0) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    Packet not_a = pcmp_eq(a, pzero(a));
+    return pand(not_a, cst_one);
+  }
 };
-template<typename Scalar>
-struct functor_traits<scalar_boolean_not_op<Scalar> > {
+template <typename Scalar>
+struct functor_traits<scalar_boolean_not_op<Scalar>> {
   enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasCmp
   };
 };
 
 /** \internal
+ * \brief Template functor to compute the bitwise not of a scalar
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::operator~
+ */
+template <typename Scalar>
+struct scalar_bitwise_not_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA
+                          TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a) const {
+    Scalar result;
+    const uint8_t* a_bytes = reinterpret_cast<const uint8_t*>(&a);
+    uint8_t* r_bytes = reinterpret_cast<uint8_t*>(&result);
+    for (Index i = 0; i < sizeof(Scalar); i++) r_bytes[i] = ~a_bytes[i];
+    return result;
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    return pandnot(ptrue(a), a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_not_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
   * \brief Template functor to compute the signum of a scalar
   * \sa class CwiseUnaryOp, Cwise::sign()
   */
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index df15e81..58aa7cc 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -23,13 +23,14 @@
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  enum {
-    IsLower = ((Mode&Lower)==Lower),
-    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
-    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
-  };
-  static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
+  static constexpr bool IsLower = ((Mode & Lower) == Lower);
+  static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag;
+  static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag;
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols,
+                                    const LhsScalar* _lhs, Index lhsStride,
+                                    const RhsScalar* _rhs, Index rhsIncr,
+                                    ResScalar* _res, Index resIncr,
+                                    const RhsScalar& alpha);
 };
 
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
@@ -94,13 +95,14 @@
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  enum {
-    IsLower = ((Mode&Lower)==Lower),
-    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
-    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
-  };
-  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                    const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+  static constexpr bool IsLower = ((Mode & Lower) == Lower);
+  static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag;
+  static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag;
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols,
+                                    const LhsScalar* _lhs, Index lhsStride,
+                                    const RhsScalar* _rhs, Index rhsIncr,
+                                    ResScalar* _res, Index resIncr,
+                                    const ResScalar& alpha);
 };
 
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
@@ -202,7 +204,7 @@
 namespace internal {
 
 // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
-  
+
 template<int Mode> struct trmv_selector<Mode,ColMajor>
 {
   template<typename Lhs, typename Rhs, typename Dest>
@@ -211,13 +213,15 @@
     typedef typename Lhs::Scalar      LhsScalar;
     typedef typename Rhs::Scalar      RhsScalar;
     typedef typename Dest::Scalar     ResScalar;
-    
+
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    
-    typedef Map<Matrix<ResScalar,Dynamic,1>, plain_enum_min(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+    constexpr int Alignment = (std::min)(
+        int(AlignedMax), int(internal::packet_traits<ResScalar>::size));
+
+    typedef Map<Matrix<ResScalar, Dynamic, 1>, Alignment> MappedDest;
 
     add_const_on_value_type_t<ActualLhsType> actualLhs = LhsBlasTraits::extract(lhs);
     add_const_on_value_type_t<ActualRhsType> actualRhs = RhsBlasTraits::extract(rhs);
@@ -226,13 +230,15 @@
     RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
     ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
 
-    enum {
-      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
-      // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
-      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
-    };
+    // FIXME find a way to allow an inner stride on the result if
+    // packet_traits<Scalar>::size==1. On the other hand, it is good for the
+    // cache to pack the vector anyway...
+    constexpr bool EvalToDestAtCompileTime =
+        Dest::InnerStrideAtCompileTime == 1;
+    constexpr bool ComplexByReal =
+        (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex);
+    constexpr bool MightCannotUseDest =
+        (Dest::InnerStrideAtCompileTime != 1) || ComplexByReal;
 
     gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
 
@@ -307,9 +313,8 @@
     RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
     ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
 
-    enum {
-      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
-    };
+    constexpr bool DirectlyUseRhs =
+        ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1;
 
     gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
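The enum-to-constexpr rewrite above recurs throughout this change; a minimal illustrative sketch of the pattern in plain C++ (not Eigen code):

    // Before: compile-time flags as members of an unnamed enum (untyped integral constants).
    struct flags_before {
      enum { IsLower = 1, HasUnitDiag = 0 };
    };

    // After: typed constexpr constants, as in the rewritten traits above. The values
    // remain usable in constant expressions but keep their intended type, which avoids
    // enum/int comparison and conversion warnings.
    struct flags_after {
      static constexpr bool IsLower = true;
      static constexpr bool HasUnitDiag = false;
    };

    static_assert(flags_after::IsLower && !flags_after::HasUnitDiag, "flags preserved");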
 
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 0865fb6..eed2397 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -121,6 +121,7 @@
   // The __device__ annotation seems to actually be needed in some cases,
   // otherwise resulting in kernel runtime errors.
   EIGEN_NV_DIAG_SUPPRESS(2886)
+  EIGEN_NV_DIAG_SUPPRESS(2929)
   EIGEN_NV_DIAG_SUPPRESS(2977)
   EIGEN_NV_DIAG_SUPPRESS(20012)
   #undef EIGEN_NV_DIAG_SUPPRESS
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index facb074..ba05383 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -210,6 +210,23 @@
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+// logical and bitwise operations
+template <typename Scalar>
+struct scalar_boolean_and_op;
+template <typename Scalar>
+struct scalar_boolean_or_op;
+template <typename Scalar>
+struct scalar_boolean_xor_op;
+template <typename Scalar>
+struct scalar_boolean_not_op;
+template <typename Scalar>
+struct scalar_bitwise_and_op;
+template <typename Scalar>
+struct scalar_bitwise_or_op;
+template <typename Scalar>
+struct scalar_bitwise_xor_op;
+template <typename Scalar>
+struct scalar_bitwise_not_op;
 
 // SpecialFunctions module
 template<typename Scalar> struct scalar_lgamma_op;
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 1a5d515..740b10f 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -135,7 +135,10 @@
 template<typename T> struct is_arithmetic      { enum { value = false }; };
 template<> struct is_arithmetic<float>         { enum { value = true }; };
 template<> struct is_arithmetic<double>        { enum { value = true }; };
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct is_arithmetic<long double>   { enum { value = true }; };
+#endif
 template<> struct is_arithmetic<bool>          { enum { value = true }; };
 template<> struct is_arithmetic<char>          { enum { value = true }; };
 template<> struct is_arithmetic<signed char>   { enum { value = true }; };
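What the guard changes: during the GPU device compilation phase (when EIGEN_GPU_COMPILE_PHASE is defined) `long double` is no longer reported as arithmetic, so trait-based dispatch skips long double paths on device. A host-side illustration only:

    #include <Eigen/Core>

    static_assert(Eigen::internal::is_arithmetic<double>::value, "always true");
    #ifndef EIGEN_GPU_COMPILE_PHASE
    // Host pass: the long double specialization above is visible.
    static_assert(Eigen::internal::is_arithmetic<long double>::value, "host pass only");
    #endif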
diff --git a/Eigen/src/Core/util/NonMPL2.h b/Eigen/src/Core/util/NonMPL2.h
deleted file mode 100644
index 1af67cf..0000000
--- a/Eigen/src/Core/util/NonMPL2.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#ifdef EIGEN_MPL2_ONLY
-#error Including non-MPL2 code in EIGEN_MPL2_ONLY mode
-#endif
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index e697f32..7a0519c 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -18,31 +18,35 @@
 
 namespace Eigen {
 /**
-  * \brief Modified Incomplete Cholesky with dual threshold
-  *
-  * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
-  *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
-  *
-  * \tparam Scalar the scalar type of the input matrices
-  * \tparam UpLo_ The triangular part that will be used for the computations. It can be Lower
-    *               or Upper. Default is Lower.
-  * \tparam OrderingType_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<int>,
-  *                       unless EIGEN_MPL2_ONLY is defined, in which case the default is NaturalOrdering<int>.
-  *
-  * \implsparsesolverconcept
-  *
-  * It performs the following incomplete factorization: \f$ S P A P' S \approx L L' \f$
-  * where L is a lower triangular factor, S is a diagonal scaling matrix, and P is a
-  * fill-in reducing permutation as computed by the ordering method.
-  *
-  * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$  be the scaled matrix on which the factorization is carried out,
-  * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly performed
-  * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta| I \f$ where
-  * \f$ \sigma \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$.
-  * If the factorization fails, then the shift in doubled until it succeed or a maximum of ten attempts. If it still fails, as returned by
-  * the info() method, then you can either increase the initial shift, or better use another preconditioning technique.
-  *
-  */
+ * \brief Modified Incomplete Cholesky with dual threshold
+ *
+ * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+ *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+ *
+ * \tparam Scalar the scalar type of the input matrices
+ * \tparam UpLo_ The triangular part that will be used for the computations. It
+ * can be Lower or Upper. Default is Lower.
+ * \tparam OrderingType_ The ordering method to use, either AMDOrdering<> or
+ * NaturalOrdering<>. Default is AMDOrdering<int>.
+ *
+ * \implsparsesolverconcept
+ *
+ * It performs the following incomplete factorization: \f$ S P A P' S \approx L
+ * L' \f$ where L is a lower triangular factor, S is a diagonal scaling matrix,
+ * and P is a fill-in reducing permutation as computed by the ordering method.
+ *
+ * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$ be the scaled matrix on
+ * which the factorization is carried out, and \f$ \beta \f$ be the minimum
+ * value of the diagonal. If \f$ \beta > 0 \f$, the factorization is directly
+ * performed on the matrix B. Otherwise, the factorization is performed on the
+ * shifted matrix \f$ B + (\sigma + |\beta|) I \f$ where \f$ \sigma \f$ is the
+ * initial shift value as returned and set by the setInitialShift() method. The
+ * default value is \f$ \sigma = 10^{-3} \f$. If the factorization fails, the
+ * shift is doubled until it succeeds or a maximum of ten attempts is reached.
+ * If it still fails, as reported by the info() method, you can either increase
+ * the initial shift or, better, use another preconditioning technique.
+ *
+ */
 template <typename Scalar, int UpLo_ = Lower, typename OrderingType_ = AMDOrdering<int> >
 class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,UpLo_,OrderingType_> >
 {
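A hedged usage sketch of the preconditioner described above, with placeholder matrix and right-hand side; the shift-doubling happens inside compute() as documented:

    #include <Eigen/SparseCore>
    #include <Eigen/IterativeLinearSolvers>

    using SpMat = Eigen::SparseMatrix<double>;

    Eigen::VectorXd solve_spd(const SpMat& A, const Eigen::VectorXd& b) {
      Eigen::ConjugateGradient<SpMat, Eigen::Lower | Eigen::Upper,
                               Eigen::IncompleteCholesky<double>> cg;
      // Initial shift sigma used when the scaled matrix has a non-positive diagonal;
      // it is doubled internally on factorization failure, up to ten attempts.
      cg.preconditioner().setInitialShift(1e-3);
      cg.compute(A);
      return cg.solve(b);
    }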
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 5d96989..b225bef 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -347,18 +347,16 @@
     typedef typename packet_traits<Scalar>::type Packet;
     typedef typename packet_traits<OtherScalar>::type OtherPacket;
 
-    enum {
-      RequiredAlignment = plain_enum_max(unpacket_traits<Packet>::alignment,
-                                         unpacket_traits<OtherPacket>::alignment),
-      PacketSize = packet_traits<Scalar>::size,
-      OtherPacketSize = packet_traits<OtherScalar>::size
-    };
+    constexpr int RequiredAlignment =
+        (std::max)(unpacket_traits<Packet>::alignment,
+                   unpacket_traits<OtherPacket>::alignment);
+    constexpr Index PacketSize = packet_traits<Scalar>::size;
 
     /*** dynamic-size vectorized paths ***/
     if(size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1))
     {
       // both vectors are sequentially stored in memory => vectorization
-      enum { Peeling = 2 };
+      constexpr Index Peeling = 2;
 
       Index alignedStart = internal::first_default_aligned(y, size);
       Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
@@ -474,11 +472,12 @@
   if (numext::is_exactly_one(c) && numext::is_exactly_zero(s))
     return;
 
-  apply_rotation_in_the_plane_selector<
-    Scalar,OtherScalar,
-    VectorX::SizeAtCompileTime,
-    plain_enum_min(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment),
-    Vectorizable>::run(x,incrx,y,incry,size,c,s);
+  constexpr int Alignment = (std::min)(int(evaluator<VectorX>::Alignment),
+                                       int(evaluator<VectorY>::Alignment));
+  apply_rotation_in_the_plane_selector<Scalar, OtherScalar,
+                                       VectorX::SizeAtCompileTime, Alignment,
+                                       Vectorizable>::run(x, incrx, y, incry,
+                                                          size, c, s);
 }
 
 } // end namespace internal
diff --git a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
index 8a6ae9b..75b3c60 100644
--- a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -117,35 +117,32 @@
     #define COLPIVQR_LAPACKE_INIT(EIGTYPE)                                                                          \
     template <> inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::init(Index rows, Index cols) {                \
       ColPivHouseholderQR_LAPACKE_impl<MatrixType>::init(rows, cols, m_hCoeffs, m_colsPermutation, m_isInitialized, \
-                                                         m_usePrescribedThreshold); }                               \
+                                                         m_usePrescribedThreshold); }
 
-    #define COLPIVQR_LAPACKE(EIGTYPE)          \
-      COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE) \
-      COLPIVQR_LAPACKE_INIT(EIGTYPE)           \
+#define COLPIVQR_LAPACKE(EIGTYPE)               \
+      COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE)      \
+      COLPIVQR_LAPACKE_INIT(EIGTYPE)                \
+      COLPIVQR_LAPACKE_COMPUTEINPLACE(Ref<EIGTYPE>) \
+      COLPIVQR_LAPACKE_INIT(Ref<EIGTYPE>)
 
+    typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXfC;
+    typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXdC;
+    typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcfC;
+    typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcdC;
     typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXfR;
     typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXdR;
-    typedef Matrix<scomplex, Dynamic, Dynamic, RowMajor> MatrixXcfR;
-    typedef Matrix<dcomplex, Dynamic, Dynamic, RowMajor> MatrixXcdR;
+    typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcfR;
+    typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcdR;
 
-    COLPIVQR_LAPACKE(MatrixXf)
-    COLPIVQR_LAPACKE(MatrixXd)
-    COLPIVQR_LAPACKE(MatrixXcf)
-    COLPIVQR_LAPACKE(MatrixXcd)
+    COLPIVQR_LAPACKE(MatrixXfC)
+    COLPIVQR_LAPACKE(MatrixXdC)
+    COLPIVQR_LAPACKE(MatrixXcfC)
+    COLPIVQR_LAPACKE(MatrixXcdC)
     COLPIVQR_LAPACKE(MatrixXfR)
     COLPIVQR_LAPACKE(MatrixXdR)
     COLPIVQR_LAPACKE(MatrixXcfR)
     COLPIVQR_LAPACKE(MatrixXcdR)
 
-    COLPIVQR_LAPACKE(Ref<MatrixXf>)
-    COLPIVQR_LAPACKE(Ref<MatrixXd>)
-    COLPIVQR_LAPACKE(Ref<MatrixXcf>)
-    COLPIVQR_LAPACKE(Ref<MatrixXcd>)
-    COLPIVQR_LAPACKE(Ref<MatrixXfR>)
-    COLPIVQR_LAPACKE(Ref<MatrixXdR>)
-    COLPIVQR_LAPACKE(Ref<MatrixXcfR>)
-    COLPIVQR_LAPACKE(Ref<MatrixXcdR>)
-
 #endif
 }  // end namespace Eigen
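With EIGEN_USE_LAPACKE defined, the rewritten macro block instantiates the LAPACKE-backed path for both storage orders and for their Ref<> views; a hedged usage sketch (requires linking a LAPACKE implementation, and the exact dispatch depends on the build configuration):

    #define EIGEN_USE_LAPACKE
    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(6, 4);
      ColPivHouseholderQR<MatrixXd> qr(A);               // column-major path, keeps its own copy of A
      VectorXd x = qr.solve(VectorXd::Ones(6));

      ColPivHouseholderQR<Ref<MatrixXd>> qr_inplace(A);  // Ref<> specialization: decomposes A in place
      (void)x; (void)qr_inplace;
    }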
 
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 17cdb8e..2352fcb 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -367,29 +367,32 @@
 };
 
 // "sparse && sparse"
-template<typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IteratorBased>
-  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
-{
-  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+template <typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs>,
+                        IteratorBased, IteratorBased>
+    : sparse_conjunction_evaluator<
+          CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> XprType;
   typedef sparse_conjunction_evaluator<XprType> Base;
   explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
 };
 // "dense && sparse"
-template<typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IndexBased, IteratorBased>
-  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
-{
-  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+template <typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs>,
+                        IndexBased, IteratorBased>
+    : sparse_conjunction_evaluator<
+          CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> XprType;
   typedef sparse_conjunction_evaluator<XprType> Base;
   explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
 };
 // "sparse && dense"
-template<typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IndexBased>
-  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
-{
-  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+template <typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs>,
+                        IteratorBased, IndexBased>
+    : sparse_conjunction_evaluator<
+          CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> XprType;
   typedef sparse_conjunction_evaluator<XprType> Base;
   explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
 };
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index bf1d562..9270aa2 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -1109,7 +1109,8 @@
   IndexMap outerIndexMap(mat.outerIndexPtr(), mat.outerSize() + 1);
   for (InputIterator it(begin); it != end; ++it) {
     eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
-    StorageIndex j = IsRowMajor ? it->row() : it->col();
+    StorageIndex j =
+        static_cast<StorageIndex>(IsRowMajor ? it->row() : it->col());
     outerIndexMap.coeffRef(j + 1)++;
   }
 
@@ -1124,8 +1125,10 @@
 
   // push triplets to back of each inner vector
   for (InputIterator it(begin); it != end; ++it) {
-    StorageIndex j = IsRowMajor ? it->row() : it->col();
-    StorageIndex i = IsRowMajor ? it->col() : it->row();
+    StorageIndex j =
+        static_cast<StorageIndex>(IsRowMajor ? it->row() : it->col());
+    StorageIndex i =
+        static_cast<StorageIndex>(IsRowMajor ? it->col() : it->row());
     mat.data().index(back.coeff(j)) = i;
     mat.data().value(back.coeff(j)) = it->value();
     back.coeffRef(j)++;
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
index 3d8e24f..c20204c 100644
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
@@ -46,7 +46,11 @@
 #include <stdlib.h>
 
 #ifndef lapack_int
-#define lapack_int     int
+  #ifdef LAPACK_ILP64
+    #define lapack_int int64_t
+  #else
+    #define lapack_int int
+  #endif
 #endif
 
 #ifndef lapack_logical
@@ -72,8 +76,7 @@
 
 /* Complex type (single precision) */
 #ifndef lapack_complex_float
-#include <complex.h>
-#define lapack_complex_float    float _Complex
+#define lapack_complex_float std::complex<float>
 #endif
 
 #ifndef lapack_complex_float_real
@@ -88,8 +91,7 @@
 
 /* Complex type (double precision) */
 #ifndef lapack_complex_double
-#include <complex.h>
-#define lapack_complex_double   double _Complex
+#define lapack_complex_double std::complex<double>
 #endif
 
 #ifndef lapack_complex_double_real
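The lapack_int change means the bundled header now honours ILP64 builds; a hedged sketch of the selection, assuming Eigen's bundled lapacke.h is the one picked up by the build:

    #include <cstdint>
    #define LAPACK_ILP64      // request the 64-bit LAPACK integer interface
    #define EIGEN_USE_LAPACKE
    #include <Eigen/Dense>

    // With LAPACK_ILP64 defined, lapack_int expands to int64_t; without it, plain int.
    static_assert(sizeof(lapack_int) == sizeof(std::int64_t), "ILP64 interface selected");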
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 30e3ee1..35461da 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -307,25 +307,6 @@
   operator/(const T& s,const StorageBaseType& a);
 #endif
 
-/** \returns an expression of the coefficient-wise ^ operator of *this and \a other
- *
- * \warning this operator is for expression of bool only.
- *
- * Example: \include Cwise_boolean_xor.cpp
- * Output: \verbinclude Cwise_boolean_xor.out
- *
- * \sa operator&&(), select()
- */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
-operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
-
 // NOTE disabled until we agree on argument order
 #if 0
 /** \cpp11 \returns an expression of the coefficient-wise polygamma function.
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index d8c1a84..7411fda 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -2,12 +2,16 @@
 
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> AbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> ArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived>
+    CArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
 typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
+typedef CwiseUnaryOp<internal::scalar_bitwise_not_op<Scalar>, const Derived>
+    BitwiseNotReturnType;
 
 typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
 typedef CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> Expm1ReturnType;
@@ -66,6 +70,11 @@
   return ArgReturnType(derived());
 }
 
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CArgReturnType carg() const {
+  return CArgReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
   *
   * Example: \include Cwise_abs2.cpp
@@ -576,8 +585,6 @@
 
 /** \returns an expression of the coefficient-wise ! operator of *this
   *
-  * \warning this operator is for expression of bool only.
-  *
   * Example: \include Cwise_boolean_not.cpp
   * Output: \verbinclude Cwise_boolean_not.out
   *
@@ -587,11 +594,15 @@
 inline const BooleanNotReturnType
 operator!() const
 {
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
   return BooleanNotReturnType(derived());
 }
 
+/** \returns an expression of the bitwise ~ operator of *this
+ */
+EIGEN_DEVICE_FUNC
+inline const BitwiseNotReturnType operator~() const {
+  return BitwiseNotReturnType(derived());
+}
 
 // --- SpecialFunctions module ---
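For reference, a hedged usage sketch of the new entry points added above (carg(), the typed operator!, and the bitwise operator~):

    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    int main() {
      ArrayXcf z(2);
      z << std::complex<float>(0.f, 1.f), std::complex<float>(-1.f, 0.f);
      // carg(): coefficient-wise atan2(imag, real); the scalar type is preserved,
      // so a complex array returns the angles with a zero imaginary part.
      std::cout << z.carg() << "\n";          // (1.5708,0) (3.14159,0)

      ArrayXi a(3);
      a << 0, 2, -1;
      std::cout << (!a).transpose() << "\n";  // typed logical not: 1 0 0
      std::cout << (~a).transpose() << "\n";  // bitwise not: -1 -3 0
    }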
 
diff --git a/Eigen/src/plugins/CommonCwiseBinaryOps.h b/Eigen/src/plugins/CommonCwiseBinaryOps.h
index 2f50329..029269c 100644
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -76,40 +76,71 @@
 const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,T>,Derived,Constant<T> > operator/(const T& scalar) const;
 #endif
 
-/** \returns an expression of the coefficient-wise boolean \b and operator of \c *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_and.cpp
-  * Output: \verbinclude Cwise_boolean_and.out
-  *
-  * \sa operator||(), select()
-  */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
-operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
+/** \returns an expression of the coefficient-wise boolean \b and operator of \c
+ * *this and \a other
+ *
+ * Example: \include Cwise_boolean_and.cpp
+ * Output: \verbinclude Cwise_boolean_and.out
+ *
+ * \sa operator||(), select()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<
+    internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>
+operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
 }
 
-/** \returns an expression of the coefficient-wise boolean \b or operator of \c *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_or.cpp
-  * Output: \verbinclude Cwise_boolean_or.out
-  *
-  * \sa operator&&(), select()
-  */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
-operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
+/** \returns an expression of the coefficient-wise boolean \b or operator of \c
+ * *this and \a other
+ *
+ * Example: \include Cwise_boolean_or.cpp
+ * Output: \verbinclude Cwise_boolean_or.out
+ *
+ * \sa operator&&(), select()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<
+    internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>
+operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the bitwise \b and operator of \c *this and \a
+ * other
+ *
+ * \sa operator|(), operator^()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<
+    internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>
+operator&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the bitwise \b or operator of \c *this and
+ * \a other
+ *
+ * \sa operator&(), operator^()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<
+    internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>
+operator|(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the bitwise \b xor operator of \c *this and \a other
+ * \sa operator&(), operator|()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<
+    internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>
+operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
 }
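A hedged sketch of how the typed logical and bitwise operators behave on a plain integer array (illustrative values):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      ArrayXi a(4), b(4);
      a << 0, 1, 2, 0;
      b << 0, 0, 3, 5;

      // Typed logical ops: any nonzero coefficient counts as true; the result is Scalar(0) or Scalar(1).
      ArrayXi land = a && b;   // 0 0 1 0
      ArrayXi lor  = a || b;   // 0 1 1 1

      // Bitwise ops act on the underlying bit patterns (plain scalar types only).
      ArrayXi band = a & b;    // 0 0 2 0
      ArrayXi bxor = a ^ b;    // 0 1 1 5
      (void)land; (void)lor; (void)band; (void)bxor;
    }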
diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index 46fe08c..b2ed335 100644
--- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -37,12 +37,12 @@
   *
   * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
   */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<numext::equal_to<Scalar>, const Derived, const OtherDerived>
-cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<numext::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<std::equal_to<Scalar>,
+                                             const Derived, const OtherDerived>
+cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+  return CwiseBinaryOp<std::equal_to<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise != operator of *this and \a other
@@ -57,12 +57,13 @@
   *
   * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
   */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<numext::not_equal_to<Scalar>, const Derived, const OtherDerived>
-cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<numext::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<std::not_equal_to<Scalar>,
+                                             const Derived, const OtherDerived>
+cwiseNotEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+  return CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of *this and \a other
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index 98d925d..0e039c1 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -15,6 +15,8 @@
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> CwiseArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived>
+    CwiseCArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
@@ -94,6 +96,11 @@
 inline const CwiseArgReturnType
 cwiseArg() const { return CwiseArgReturnType(derived()); }
 
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseCArgReturnType cwiseCArg() const {
+  return CwiseCArgReturnType(derived());
+}
+
 template <typename ScalarExponent>
 using CwisePowReturnType =
     std::enable_if_t<internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
diff --git a/bench/tensors/README b/bench/tensors/README
index acce5a1..3aca767 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -17,10 +17,10 @@
 2. bash eigen_sycl_bench.sh
 
 To compile the floating point GPU benchmarks using Intel DPCPP compiler 
-/path/to/dpcpp/bin/clang+  -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../  -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda"  -std=c++17  tensor_benchmarks_sycl.cc benchmark_main.cc  -lpthread -o eigen_dpcpp_sycl
+/path/to/dpcpp/bin/clang++  -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_USE_SYCL=1 -I ../../  -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda"  -std=c++17  tensor_benchmarks_sycl.cc benchmark_main.cc  -lpthread -o eigen_dpcpp_sycl
 
 Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
 g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
 
 To compile the contraction with DPCPP: 
-/path/to/dpcpp/bin/clang++  -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../  -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17   tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract
+/path/to/dpcpp/bin/clang++  -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_USE_SYCL=1 -I ../../  -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17   tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index d6024dc..be47ac9 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -78,9 +78,6 @@
  - \b eigen_assert - macro with one argument that is used inside %Eigen for assertions. By default, it is
    basically defined to be \c assert, which aborts the program if the assertion is violated. Redefine this
    macro if you want to do something else, like throwing an exception.
- - \b EIGEN_MPL2_ONLY - disable non MPL2 compatible features, or in other words disable the features which
-   are still under the LGPL.
-
 
 \section TopicPreprocessorDirectivesPerformance Alignment, vectorization and performance tweaking
 
diff --git a/doc/snippets/Cwise_boolean_xor.cpp b/doc/snippets/Cwise_boolean_xor.cpp
deleted file mode 100644
index fafbec8..0000000
--- a/doc/snippets/Cwise_boolean_xor.cpp
+++ /dev/null
@@ -1,2 +0,0 @@
-Array3d v(-1,2,1), w(-3,2,3);
-cout << ((v<w) ^ (v<0)) << endl;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 223a9f1..f402ab3 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -299,7 +299,6 @@
 ei_add_test(rvalue_types)
 ei_add_test(dense_storage)
 ei_add_test(ctorleak)
-ei_add_test(mpl2only)
 ei_add_test(inplace_decomposition)
 ei_add_test(half_float)
 ei_add_test(bfloat16_float)
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index f0a80df..44a6a50 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -29,7 +29,7 @@
   const Scalar two = Scalar(2);
   const Scalar three = Scalar(3);
   const Scalar sqrt_half = Scalar(std::sqrt(0.5));
-  const Scalar sqrt2 = Scalar(std::sqrt(2));
+  const Scalar sqrt2 = Scalar(std::sqrt(2));    
   const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
   const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
   const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
@@ -93,16 +93,91 @@
   VERIFY(all_pass);
 }
 
+#define BINARY_FUNCTOR_TEST_ARGS(fun) #fun, \
+      [](const auto& x, const auto& y) { return (Eigen::fun)(x, y); },    \
+      [](const auto& x, const auto& y) { return (std::fun)(x, y); }
+
+
 template <typename Scalar>
 void binary_ops_test() {
-  binary_op_test<Scalar>("pow",
-                         [](const auto& x, const auto& y) { return Eigen::pow(x, y); },
-                         [](const auto& x, const auto& y) { return std::pow(x, y); });
-  binary_op_test<Scalar>("atan2",
-                         [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
-                         [](const auto& x, const auto& y) { return std::atan2(x, y); });
+  binary_op_test<Scalar>(BINARY_FUNCTOR_TEST_ARGS(pow));
+#ifndef EIGEN_COMP_MSVC
+  binary_op_test<Scalar>(BINARY_FUNCTOR_TEST_ARGS(atan2));
+#else
+  binary_op_test<Scalar>(
+      "atan2", [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
+      [](Scalar x, Scalar y) {
+        auto t = Scalar(std::atan2(x, y));
+        // Work around MSVC return value on underflow.
+        // |atan(x/y)| is bounded above by |x/y|, so on underflow return x/y per the POSIX spec.
+        // MSVC otherwise returns denorm_min.
+        if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits<decltype(t)>::denorm_min())) {
+          return x / y;
+        }
+        return t;
+      });
+#endif
 }
 
+
+template <typename Scalar, typename Fn, typename RefFn>
+void unary_op_test(std::string name, Fn fun, RefFn ref) {
+  const Scalar tol = test_precision<Scalar>();
+  auto values = special_values<Scalar>();
+  Map<Array<Scalar, Dynamic, 1>> x(values.data(), values.size());
+
+  Array<Scalar, Dynamic, Dynamic> actual = fun(x);
+  bool all_pass = true;
+  for (Index i = 0; i < x.size(); ++i) {
+    Scalar e = static_cast<Scalar>(ref(x(i)));
+    Scalar a = actual(i);
+    bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
+                   ((numext::isnan)(a) && (numext::isnan)(e));
+    if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
+    all_pass &= success;
+    if (!success) {
+      std::cout << name << "(" << x(i) << ") = " << a << " !=  " << e << std::endl;
+    }
+  }
+  VERIFY(all_pass);
+}
+
+#define UNARY_FUNCTOR_TEST_ARGS(fun) #fun, \
+      [](const auto& x) { return (Eigen::fun)(x); },    \
+      [](const auto& x) { return (std::fun)(x); }
+
+template <typename Scalar>
+void unary_ops_test() {
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sqrt));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(exp));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(log));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sin));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cos));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(tan));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(asin));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(acos));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(atan));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sinh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cosh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(tanh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(asinh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(acosh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(atanh));
+  /* FIXME: Enable when the behavior of rsqrt on denormals for half and double is fixed. 
+  unary_op_test<Scalar>("rsqrt",
+                        [](const auto& x) { return Eigen::rsqrt(x); }, 
+                        [](Scalar x) {
+                          if (x >= 0 && x < (std::numeric_limits<Scalar>::min)()) {
+                            // rsqrt return +inf for positive subnormals.
+                            return NumTraits<Scalar>::infinity();
+                          } else {
+                            return  Scalar(std::sqrt(Scalar(1)/x));
+                          }
+                        });
+  */
+}
+
+
 template <typename Scalar>
 void pow_scalar_exponent_test() {
   using Int_t = typename internal::make_integer<Scalar>::type;
@@ -251,9 +326,9 @@
   unary_pow_test<double, long long>();
 
   // The following cases will test promoting a wider exponent type
-  // to a narrower base type. This should compile but generate a
+  // to a narrower base type. This should compile but would generate a
   // deprecation warning:
-  unary_pow_test<float, double>();
+  // unary_pow_test<float, double>();
 }
 
 void int_pow_test() {
@@ -619,6 +694,7 @@
   VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
   VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1)));
   VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1)));
+  VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
   VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
 
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
@@ -711,6 +787,7 @@
   VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse());
 
   // Test pow and atan2 on special IEEE values.
+  unary_ops_test<Scalar>();
   binary_ops_test<Scalar>();
   pow_scalar_exponent_test<Scalar>();
 
@@ -762,6 +839,8 @@
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
   VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
+  VERIFY_IS_APPROX(m1.carg(), carg(m1));
+  VERIFY_IS_APPROX(arg(m1), carg(m1));
   VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
   VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
   VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
@@ -795,6 +874,7 @@
     for (Index j = 0; j < m.cols(); ++j)
       m3(i,j) = std::atan2(m1(i,j).imag(), m1(i,j).real());
   VERIFY_IS_APPROX(arg(m1), m3);
+  VERIFY_IS_APPROX(carg(m1), m3);
 
   std::complex<RealScalar> zero(0.0,0.0);
   VERIFY((Eigen::isnan)(m1*zero/zero).all());
@@ -954,6 +1034,104 @@
     signed_shift_test_impl<ArrayType>::run(m);
 }
 
+template <typename ArrayType>
+struct typed_logicals_test_impl {
+  using Scalar = typename ArrayType::Scalar;
+
+  static bool scalar_to_bool(const Scalar& x) { return x != Scalar(0); }
+  static Scalar bool_to_scalar(const bool& x) { return x ? Scalar(1) : Scalar(0); }
+
+  static Scalar eval_bool_and(const Scalar& x, const Scalar& y) { return bool_to_scalar(scalar_to_bool(x) && scalar_to_bool(y)); }
+  static Scalar eval_bool_or(const Scalar& x, const Scalar& y) { return bool_to_scalar(scalar_to_bool(x) || scalar_to_bool(y)); }
+  static Scalar eval_bool_xor(const Scalar& x, const Scalar& y) { return bool_to_scalar(scalar_to_bool(x) != scalar_to_bool(y)); }
+  static Scalar eval_bool_not(const Scalar& x) { return bool_to_scalar(!scalar_to_bool(x)); }
+
+  static void run(const ArrayType& m) {
+      
+    Index rows = m.rows();
+    Index cols = m.cols();
+
+    ArrayType m1(rows, cols), m2(rows, cols), m3(rows, cols), m4(rows, cols);
+
+    m1.setRandom();
+    m2.setRandom();
+    m1 *= ArrayX<bool>::Random(rows, cols).cast<Scalar>();
+    m2 *= ArrayX<bool>::Random(rows, cols).cast<Scalar>();
+
+    // test boolean and
+    m3 = m1 && m2;
+    m4 = m1.binaryExpr(m2, [](const Scalar& x, const Scalar& y) { return eval_bool_and(x, y); });
+    VERIFY_IS_CWISE_EQUAL(m3, m4);
+    for (const Scalar& val : m3) VERIFY(val == Scalar(0) || val == Scalar(1));
+
+    // test boolean or
+    m3 = m1 || m2;
+    m4 = m1.binaryExpr(m2, [](const Scalar& x, const Scalar& y) { return eval_bool_or(x, y); });
+    VERIFY_IS_CWISE_EQUAL(m3, m4);
+    for (const Scalar& val : m3) VERIFY(val == Scalar(0) || val == Scalar(1));
+
+    // test boolean xor
+    m3 = m1.binaryExpr(m2, internal::scalar_boolean_xor_op<Scalar>());
+    m4 = m1.binaryExpr(m2, [](const Scalar& x, const Scalar& y) { return eval_bool_xor(x, y); });
+    VERIFY_IS_CWISE_EQUAL(m3, m4);
+    for (const Scalar& val : m3) VERIFY(val == Scalar(0) || val == Scalar(1));
+
+    // test boolean not
+    m3 = !m1;
+    m4 = m1.unaryExpr([](const Scalar& x) { return eval_bool_not(x); });
+    VERIFY_IS_CWISE_EQUAL(m3, m4);
+    for (const Scalar& val : m3) VERIFY(val == Scalar(0) || val == Scalar(1));
+
+    // test something more complicated
+    m3 = m1 && m2;
+    m4 = !(!m1 || !m2);
+    VERIFY_IS_CWISE_EQUAL(m3, m4);
+
+    m3 = m1.binaryExpr(m2, internal::scalar_boolean_xor_op<Scalar>());
+    m4 = (!m1).binaryExpr((!m2), internal::scalar_boolean_xor_op<Scalar>());
+    VERIFY_IS_CWISE_EQUAL(m3, m4);
+
+    const Index bytes = rows * cols * sizeof(Scalar);
+    const uint8_t* m1_data = reinterpret_cast<const uint8_t*>(m1.data());
+    const uint8_t* m2_data = reinterpret_cast<const uint8_t*>(m2.data());
+    uint8_t* m3_data = reinterpret_cast<uint8_t*>(m3.data());
+    uint8_t* m4_data = reinterpret_cast<uint8_t*>(m4.data());
+
+    // test bitwise and
+    m3 = m1 & m2;
+    for (Index i = 0; i < bytes; i++) m4_data[i] = m1_data[i] & m2_data[i];
+    for (Index i = 0; i < bytes; i++) VERIFY_IS_EQUAL(m3_data[i], m4_data[i]);
+
+    // test bitwise or
+    m3 = m1 | m2;
+    for (Index i = 0; i < bytes; i++) m4_data[i] = m1_data[i] | m2_data[i];
+    for (Index i = 0; i < bytes; i++) VERIFY_IS_EQUAL(m3_data[i], m4_data[i]);
+
+    // test bitwise xor
+    m3 = m1 ^ m2;
+    for (Index i = 0; i < bytes; i++) m4_data[i] = m1_data[i] ^ m2_data[i];
+    for (Index i = 0; i < bytes; i++) VERIFY_IS_EQUAL(m3_data[i], m4_data[i]);
+
+    // test bitwise not
+    m3 = ~m1;
+    for (Index i = 0; i < bytes; i++) m4_data[i] = ~m1_data[i];
+    for (Index i = 0; i < bytes; i++) VERIFY_IS_EQUAL(m3_data[i], m4_data[i]);
+
+    // test something more complicated
+    m3 = m1 & m2;
+    m4 = ~(~m1 | ~m2);
+    for (Index i = 0; i < bytes; i++) VERIFY_IS_EQUAL(m3_data[i], m4_data[i]);
+
+    m3 = m1 ^ m2;
+    m4 = (~m1) ^ (~m2);
+    for (Index i = 0; i < bytes; i++) VERIFY_IS_EQUAL(m3_data[i], m4_data[i]);
+  }
+};
+template <typename ArrayType>
+void typed_logicals_test(const ArrayType& m) {
+    typed_logicals_test_impl<ArrayType>::run(m);
+}
+
 EIGEN_DECLARE_TEST(array_cwise)
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -994,6 +1172,7 @@
   }
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_4( array_complex(ArrayXXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_5( array_complex(ArrayXXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
   }
 
   for(int i = 0; i < g_repeat; i++) {
@@ -1001,6 +1180,14 @@
     CALL_SUBTEST_7( mixed_pow_test() );
     CALL_SUBTEST_8( signbit_tests() );
   }
+  for (int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( typed_logicals_test(ArrayX<bool>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( typed_logicals_test(ArrayX<int>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( typed_logicals_test(ArrayX<float>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_3( typed_logicals_test(ArrayX<double>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_3( typed_logicals_test(ArrayX<std::complex<float>>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_3( typed_logicals_test(ArrayX<std::complex<double>>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
+  }
 
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<int>::type, int >::value));
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<float>::type, float >::value));
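The MSVC atan2 workaround in binary_ops_test above relies on the small-angle limit; a short numeric illustration of the identity being used (values illustrative, behavior depends on the libm in use):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main() {
      // For |x/y| far below machine precision, atan2(x, y) rounds to x / y;
      // the test substitutes x / y when MSVC instead flushes the result to denorm_min.
      double x = std::numeric_limits<double>::min();
      double y = 1.0;
      std::printf("%.17g\n%.17g\n", std::atan2(x, y), x / y);
    }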
diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
index 539494b..b21479e 100644
--- a/test/bdcsvd.cpp
+++ b/test/bdcsvd.cpp
@@ -95,9 +95,7 @@
 
 template <typename MatrixType>
 void bdcsvd_all_options(const MatrixType& input = MatrixType()) {
-  MatrixType m(input.rows(), input.cols());
-  svd_fill_random(m);
-  svd_option_checks<MatrixType, 0>(m);
+  svd_option_checks<MatrixType, 0>(input);
 }
 
 template <typename MatrixType>
diff --git a/test/fastmath.cpp b/test/fastmath.cpp
index 00a1a59..902dfd7 100644
--- a/test/fastmath.cpp
+++ b/test/fastmath.cpp
@@ -43,11 +43,11 @@
   }
   else
   {
-    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !(numext::isfinite)(m(3)) ); g_test_level=0;
-    if( (std::isinf)   (m(3))) g_test_level=1;  VERIFY( !(numext::isinf)(m(3)) );    g_test_level=0;
-    if(!(std::isnan)   (m(3))) g_test_level=1;  VERIFY(  (numext::isnan)(m(3)) );    g_test_level=0;
-    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !m.allFinite() );            g_test_level=0;
-    if(!(std::isnan)   (m(3))) g_test_level=1;  VERIFY(  m.hasNaN() );               g_test_level=0;
+    if( (std::isfinite)(m(3))) { g_test_level=1;  VERIFY( !(numext::isfinite)(m(3)) ); g_test_level=0; }
+    if( (std::isinf)   (m(3))) { g_test_level=1;  VERIFY( !(numext::isinf)(m(3)) );    g_test_level=0; }
+    if(!(std::isnan)   (m(3))) { g_test_level=1;  VERIFY(  (numext::isnan)(m(3)) );    g_test_level=0; }
+    if( (std::isfinite)(m(3))) { g_test_level=1;  VERIFY( !m.allFinite() );            g_test_level=0; }
+    if(!(std::isnan)   (m(3))) { g_test_level=1;  VERIFY(  m.hasNaN() );               g_test_level=0; }
   }
   T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
   m(4) /= hidden_zero;
@@ -62,11 +62,11 @@
   }
   else
   {
-    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !(numext::isfinite)(m(4)) );  g_test_level=0;
-    if(!(std::isinf)   (m(3))) g_test_level=1;  VERIFY(  (numext::isinf)(m(4)) );     g_test_level=0;
-    if( (std::isnan)   (m(3))) g_test_level=1;  VERIFY( !(numext::isnan)(m(4)) );     g_test_level=0;
-    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !m.allFinite() );             g_test_level=0;
-    if(!(std::isnan)   (m(3))) g_test_level=1;  VERIFY(  m.hasNaN() );                g_test_level=0;
+    if( (std::isfinite)(m(3))) { g_test_level=1;  VERIFY( !(numext::isfinite)(m(4)) );  g_test_level=0; }
+    if(!(std::isinf)   (m(3))) { g_test_level=1;  VERIFY(  (numext::isinf)(m(4)) );     g_test_level=0; }
+    if( (std::isnan)   (m(3))) { g_test_level=1;  VERIFY( !(numext::isnan)(m(4)) );     g_test_level=0; }
+    if( (std::isfinite)(m(3))) { g_test_level=1;  VERIFY( !m.allFinite() );             g_test_level=0; }
+    if(!(std::isnan)   (m(3))) { g_test_level=1;  VERIFY(  m.hasNaN() );                g_test_level=0; }
   }
   m(3) = 0;
   if(dryrun)
@@ -80,11 +80,11 @@
   }
   else
   {
-    if(!(std::isfinite)(m(3))) g_test_level=1;  VERIFY(  (numext::isfinite)(m(3)) );  g_test_level=0;
-    if( (std::isinf)   (m(3))) g_test_level=1;  VERIFY( !(numext::isinf)(m(3)) );     g_test_level=0;
-    if( (std::isnan)   (m(3))) g_test_level=1;  VERIFY( !(numext::isnan)(m(3)) );     g_test_level=0;
-    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !m.allFinite() );             g_test_level=0;
-    if( (std::isnan)   (m(3))) g_test_level=1;  VERIFY( !m.hasNaN() );                g_test_level=0;
+    if(!(std::isfinite)(m(3))) { g_test_level=1;  VERIFY(  (numext::isfinite)(m(3)) );  g_test_level=0; }
+    if( (std::isinf)   (m(3))) { g_test_level=1;  VERIFY( !(numext::isinf)(m(3)) );     g_test_level=0; }
+    if( (std::isnan)   (m(3))) { g_test_level=1;  VERIFY( !(numext::isnan)(m(3)) );     g_test_level=0; }
+    if( (std::isfinite)(m(3))) { g_test_level=1;  VERIFY( !m.allFinite() );             g_test_level=0; }
+    if( (std::isnan)   (m(3))) { g_test_level=1;  VERIFY( !m.hasNaN() );                g_test_level=0; }
   }
 }
 
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index 00838ea..67f16bf 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -456,11 +456,10 @@
   // numeric_limits
   CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
 
-#if defined(__NVCC__)
-  // FIXME
-  // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
-  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
-  typedef Matrix<float,6,6> Matrix6f;
-  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
-#endif
+  // These tests require dynamic-sized matrix multiplication, which isn't currently
+  // supported on GPU.
+  
+  // CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
+  // typedef Matrix<float,6,6> Matrix6f;
+  // CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
 }
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 401adf0..0c04235 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -75,9 +75,11 @@
 }
 
 template <typename MatrixType>
-void jacobisvd_verify_inputs(const MatrixType& m = MatrixType()) {
+void jacobisvd_verify_inputs(const MatrixType& input = MatrixType()) {
   // check defaults
   typedef JacobiSVD<MatrixType> DefaultSVD;
+  MatrixType m(input.rows(), input.cols());
+  svd_fill_random(m);
   DefaultSVD defaultSvd(m);
   VERIFY((int)DefaultSVD::QRPreconditioner == (int)ColPivHouseholderQRPreconditioner);
   VERIFY(!defaultSvd.computeU());
diff --git a/test/mpl2only.cpp b/test/mpl2only.cpp
deleted file mode 100644
index 296350d..0000000
--- a/test/mpl2only.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MPL2_ONLY
-#define EIGEN_MPL2_ONLY
-#endif
-#include <Eigen/Dense>
-#include <Eigen/SparseCore>
-#include <Eigen/SparseLU>
-#include <Eigen/SparseQR>
-#include <Eigen/Sparse>
-#include <Eigen/IterativeLinearSolvers>
-#include <Eigen/Eigen>
-
-int main()
-{
-  return 0;
-}
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index f082985..76332d7 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -879,6 +879,9 @@
   }
   CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
   CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
+  CHECK_CWISE1_IF(PacketTraits::HasATan, std::atan, internal::patan);
+  CHECK_CWISE1_IF(PacketTraits::HasATanh, std::atanh, internal::patanh);
+
 
   for (int i = 0; i < size; ++i) {
     data1[i] = Scalar(internal::random<double>(-87, 88));
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index a9fc9fd..2cd9dab 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -164,7 +164,7 @@
 
     // test sort
     if (inner > 1) {
-      bool StorageOrdersMatch = DenseMatrix::IsRowMajor == SparseMatrixType::IsRowMajor;
+      bool StorageOrdersMatch = int(DenseMatrix::IsRowMajor) == int(SparseMatrixType::IsRowMajor);
       DenseMatrix m1(rows, cols);
       m1.setZero();
       SparseMatrixType m2(rows, cols);
diff --git a/test/svd_common.h b/test/svd_common.h
index 9822595..84551f3 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -392,12 +392,14 @@
 }
 
 template <typename MatrixType, int QRPreconditioner = 0>
-void svd_verify_assert_full_only(const MatrixType& m = MatrixType()) {
+void svd_verify_assert_full_only(const MatrixType& input = MatrixType()) {
   enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime };
-
+  
   typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, 1> RhsType;
-  RhsType rhs = RhsType::Zero(m.rows());
-
+  RhsType rhs = RhsType::Zero(input.rows());
+  MatrixType m(input.rows(), input.cols());
+  svd_fill_random(m);
+  
   SVD_STATIC_OPTIONS(MatrixType, QRPreconditioner) svd0;
   VERIFY_RAISES_ASSERT((svd0.matrixU()));
   VERIFY_RAISES_ASSERT((svd0.singularValues()));
@@ -420,11 +422,12 @@
 }
 
 template <typename MatrixType, int QRPreconditioner = 0>
-void svd_verify_assert(const MatrixType& m = MatrixType()) {
+void svd_verify_assert(const MatrixType& input = MatrixType()) {
   enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime };
-
   typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, 1> RhsType;
-  RhsType rhs = RhsType::Zero(m.rows());
+  RhsType rhs = RhsType::Zero(input.rows());
+  MatrixType m(input.rows(), input.cols());
+  svd_fill_random(m);
 
   SVD_STATIC_OPTIONS(MatrixType, QRPreconditioner | ComputeThinU) svdThinU(m);
   VERIFY_RAISES_ASSERT((svdThinU.matrixV()));
@@ -509,7 +512,9 @@
 }
 
 template <typename MatrixType, int QRPreconditioner = 0>
-void svd_option_checks(const MatrixType& m) {
+void svd_option_checks(const MatrixType& input) {
+  MatrixType m(input.rows(), input.cols());
+  svd_fill_random(m);
   svd_compute_checks<MatrixType, QRPreconditioner>(m);
   svd_compute_checks<MatrixType, QRPreconditioner | ComputeThinU>(m);
   svd_compute_checks<MatrixType, QRPreconditioner | ComputeThinV>(m);
@@ -543,7 +548,9 @@
 }
 
 template <typename MatrixType, int QRPreconditioner = 0>
-void svd_option_checks_full_only(const MatrixType& m) {
+void svd_option_checks_full_only(const MatrixType& input) {
+  MatrixType m(input.rows(), input.cols());
+  svd_fill_random(m);
   svd_compute_checks<MatrixType, QRPreconditioner | ComputeFullU>(m);
   svd_compute_checks<MatrixType, QRPreconditioner | ComputeFullV>(m);
   svd_compute_checks<MatrixType, QRPreconditioner | ComputeFullU | ComputeFullV>(m);
@@ -563,12 +570,14 @@
   int cols = MaxColsAtCompileTime == Dynamic ? initialCols : (std::min)(initialCols, (int)MaxColsAtCompileTime);
 
   MatrixType m(rows, cols);
+  svd_fill_random(m);
   SVD_STATIC_OPTIONS(MatrixType, QRPreconditioner | ComputeThinU | ComputeThinV) thinSvd(m);
   SVD_STATIC_OPTIONS(MatrixType, QRPreconditioner | ComputeThinU | ComputeFullV) mixedSvd1(m);
   SVD_STATIC_OPTIONS(MatrixType, QRPreconditioner | ComputeFullU | ComputeThinV) mixedSvd2(m);
   SVD_STATIC_OPTIONS(MatrixType, QRPreconditioner | ComputeFullU | ComputeFullV) fullSvd(m);
 
   MatrixType n(MaxRowsAtCompileTime, MaxColsAtCompileTime);
+  svd_fill_random(n);
   thinSvd.compute(n);
   mixedSvd1.compute(n);
   mixedSvd2.compute(n);
@@ -595,6 +604,7 @@
 
   typedef Matrix<Scalar, RowsAtCompileTime, 1> RhsType;
   RhsType rhs(rows);
+  svd_fill_random(rhs);
   SvdType svd;
   VERIFY_RAISES_ASSERT(svd.matrixU())
   VERIFY_RAISES_ASSERT(svd.singularValues())
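The pattern applied throughout svd_common.h above is: size a working matrix from the caller-supplied shape, fill it with defined values (svd_fill_random is the test suite's helper), and only then hand it to the decomposition, so MSAN never sees uninitialized reads. A standalone sketch of the same idea, with Random() standing in for the test helper and the runtime-options JacobiSVD constructor used for brevity:

#include <Eigen/SVD>
#include <iostream>

int main() {
  // Defined data instead of an uninitialized matrix keeps MSAN quiet.
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(6, 4);
  Eigen::JacobiSVD<Eigen::MatrixXd> svd(m, Eigen::ComputeThinU | Eigen::ComputeThinV);
  std::cout << svd.singularValues().transpose() << "\n";
  return 0;
}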
diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp
index 52cdd9e..40b9ab8 100644
--- a/test/unalignedcount.cpp
+++ b/test/unalignedcount.cpp
@@ -32,6 +32,7 @@
 {
   #if defined(EIGEN_VECTORIZE_AVX512)
   VectorXf a(48), b(48);
+  a.fill(0); b.fill(1);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 6, 0, 3, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) += b.segment(0,48), 3, 3, 3, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) -= b.segment(0,48), 3, 3, 3, 0);
@@ -39,6 +40,7 @@
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) /= 3.5, 3, 0, 3, 0);
   #elif defined(EIGEN_VECTORIZE_AVX)
   VectorXf a(40), b(40);
+  a.fill(0); b.fill(1);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 5, 5, 5, 0);
@@ -46,6 +48,7 @@
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 5, 0, 5, 0);
   #elif defined(EIGEN_VECTORIZE_SSE)
   VectorXf a(40), b(40);
+  a.fill(0); b.fill(1);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 10, 10, 10, 0);
diff --git a/unsupported/Eigen/ArpackSupport b/unsupported/Eigen/ArpackSupport
index 67c4ac8..b89d8e9 100644
--- a/unsupported/Eigen/ArpackSupport
+++ b/unsupported/Eigen/ArpackSupport
@@ -23,7 +23,10 @@
 #include "../../Eigen/SparseCholesky"
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
 #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
index 62fc0b3..6372e8f 100644
--- a/unsupported/Eigen/AutoDiff
+++ b/unsupported/Eigen/AutoDiff
@@ -32,10 +32,11 @@
 }
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
-
+// IWYU pragma: begin_exports
 #include "src/AutoDiff/AutoDiffScalar.h"
 // #include "src/AutoDiff/AutoDiffVector.h"
 #include "src/AutoDiff/AutoDiffJacobian.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/BVH b/unsupported/Eigen/BVH
index 666c983..136ec4d 100644
--- a/unsupported/Eigen/BVH
+++ b/unsupported/Eigen/BVH
@@ -87,8 +87,10 @@
 
 //@{
 
+// IWYU pragma: begin_exports
 #include "src/BVH/BVAlgorithms.h"
 #include "src/BVH/KdBVH.h"
+// IWYU pragma: end_exports
 
 //@}
 
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0a04a0e..98e0918 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -16,8 +16,11 @@
 #include "../SpecialFunctions"
 
 #include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
+// IWYU pragma: end_exports
 
 /** \defgroup CXX11_Tensor_Module Tensor Module
   *
@@ -54,6 +57,7 @@
   #endif
 #endif
 
+// IWYU pragma: begin_exports
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
@@ -130,8 +134,7 @@
 #include "src/Tensor/TensorFixedSize.h"
 #include "src/Tensor/TensorMap.h"
 #include "src/Tensor/TensorRef.h"
-
-
+// IWYU pragma: end_exports
 
 #include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry
index a5c9609..f559c26 100644
--- a/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/unsupported/Eigen/CXX11/TensorSymmetry
@@ -28,10 +28,12 @@
   * \endcode
   */
 
+// IWYU pragma: begin_exports
 #include "src/TensorSymmetry/util/TemplateGroupTheory.h"
 #include "src/TensorSymmetry/Symmetry.h"
 #include "src/TensorSymmetry/StaticSymmetry.h"
 #include "src/TensorSymmetry/DynamicSymmetry.h"
+// IWYU pragma: end_exports
 
 #include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index ddd6cdc..16c7dbb 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -80,6 +80,7 @@
   #endif
 #endif
 
+// IWYU pragma: begin_exports
 #include "src/ThreadPool/ThreadLocal.h"
 #include "src/ThreadPool/ThreadYield.h"
 #include "src/ThreadPool/ThreadCancel.h"
@@ -89,6 +90,7 @@
 #include "src/ThreadPool/ThreadEnvironment.h"
 #include "src/ThreadPool/Barrier.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
+// IWYU pragma: end_exports
 
 #include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index e3fc8f4..5e6cdb1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -490,34 +490,59 @@
       return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
     }
 
-  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
+    template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMax(const OtherDerived& other) const {
-    return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
+      return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
     }
 
-  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMin(const OtherDerived& other) const {
       return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
     }
 
+    // logical operators
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>
     operator&&(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_boolean_and_op());
+      return binaryExpr(other.derived(), internal::scalar_boolean_and_op<Scalar>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>
     operator||(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
+      return binaryExpr(other.derived(), internal::scalar_boolean_or_op<Scalar>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>
+    operator&(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_bitwise_and_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>
+    operator|(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_bitwise_or_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>
     operator^(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
+      return binaryExpr(other.derived(), internal::scalar_bitwise_xor_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived>
+    operator!() const {
+      return unaryExpr(internal::scalar_boolean_not_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseUnaryOp<internal::scalar_bitwise_not_op<Scalar>, const Derived>
+    operator~() const {
+      return unaryExpr(internal::scalar_bitwise_not_op<Scalar>());
     }
 
     // Comparisons and tests.
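With the typed functors above, the tensor logical operators keep their scalar type, and the new bitwise and negation operators become available on integer tensors. A minimal usage sketch (illustrative, not part of the patch; assumes only the Tensor module header):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<bool, 1> a(4), b(4);
  a.setValues({true, false, true, false});
  b.setValues({true, true, false, false});

  // operator&& / operator|| / operator! now route through the Scalar-typed
  // scalar_boolean_*_op functors.
  Eigen::Tensor<bool, 1> both   = a && b;
  Eigen::Tensor<bool, 1> either = a || b;
  Eigen::Tensor<bool, 1> neg    = !a;

  // The bitwise variants operate on integer scalars.
  Eigen::Tensor<int, 1> x(4), y(4);
  x.setValues({0b1100, 0b1010, 0b0001, 0b1111});
  y.setValues({0b1010, 0b0110, 0b0011, 0b0000});
  Eigen::Tensor<int, 1> masked   = x & y;
  Eigen::Tensor<int, 1> merged   = x | y;
  Eigen::Tensor<int, 1> toggled  = x ^ y;
  Eigen::Tensor<int, 1> inverted = ~x;

  std::cout << both << "\n" << masked << "\n" << (void)either, (void)neg, (void)merged, (void)toggled, (void)inverted;
  return 0;
}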
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index e6e586b..158d250 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -722,26 +722,26 @@
 #endif
 
   // Load inputs to shared memory
-  const int first_x = blockIdx.x * maxX;
-  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
-  const int num_x_input = last_x - first_x + kernelSizeX;
+  const size_t first_x = blockIdx.x * maxX;
+  const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const size_t num_x_input = last_x - first_x + kernelSizeX;
 
-  const int first_y = blockIdx.y * maxY;
-  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
-  const int num_y_input = last_y - first_y + kernelSizeY;
+  const size_t first_y = blockIdx.y * maxY;
+  const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const size_t num_y_input = last_y - first_y + kernelSizeY;
 
-  const int first_z = blockIdx.z * maxZ;
-  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
-  const int num_z_input = last_z - first_z + kernelSizeZ;
+  const size_t first_z = blockIdx.z * maxZ;
+  const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const size_t num_z_input = last_z - first_z + kernelSizeZ;
 
   for (int p = 0; p < numPlanes; ++p) {
 
     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
-    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
-      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
-        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+    for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) {
           const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
@@ -751,18 +751,18 @@
     __syncthreads();
 
     // Convolution
-    const int num_z_output = last_z - first_z + 1;
-    const int num_y_output = last_y - first_y + 1;
-    const int num_x_output = last_x - first_x + 1;
+    const size_t num_z_output = last_z - first_z + 1;
+    const size_t num_y_output = last_y - first_y + 1;
+    const size_t num_x_output = last_x - first_x + 1;
     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
-    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
-      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
-        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+    for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) {
           float result = 0.0f;
-          for (int n = 0; n < kernelSizeZ; ++n) {
-            for (int m = 0; m < kernelSizeY; ++m) {
-              for (int l = 0; l < kernelSizeX; ++l) {
+          for (size_t n = 0; n < kernelSizeZ; ++n) {
+            for (size_t m = 0; m < kernelSizeY; ++m) {
+              for (size_t l = 0; l < kernelSizeX; ++l) {
                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
               }
             }
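The switch to size_t above keeps the loop induction variables in the same (unsigned) domain as threadIdx.x / blockDim.x, which avoids -Wsign-compare noise in the GPU kernels. The shape of the issue in plain C++ (illustrative, not CUDA):

#include <cstddef>

void walk(std::size_t num_x_input) {
  // for (int i = 0; i < num_x_input; ++i) {}       // -Wsign-compare: int vs size_t
  for (std::size_t i = 0; i < num_x_input; ++i) {   // matching unsigned types, warning-free
    (void)i;
  }
}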
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index 8ea1bf0..b477907 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -378,7 +378,7 @@
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
   }
   EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
+    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
   }
   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 8fdc8ba..f3141db 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -794,6 +794,17 @@
 #endif
   }
 
+  template <typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> get_null_accessor()
+      const {
+    eigen_assert(null_buff_simulator.get_size() % sizeof(T) == 0 && "The null buffer size must be a multiple of sizeof(T)");
+    const ptrdiff_t typed_size = null_buff_simulator.get_size() / sizeof(T);
+    eigen_assert(typed_size > 0);
+    auto typed_null_buff =
+        null_buff_simulator.template reinterpret<T>(cl::sycl::range<1>(typed_size));
+    return TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>(typed_null_buff);
+  }
+
  protected:
   EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
@@ -852,6 +863,7 @@
   /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
   /// host we should manually deallocate it.
   mutable TensorSycl::internal::PointerMapper pMapper;
+  cl::sycl::buffer<uint8_t, 1> null_buff_simulator = cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(128));
 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
   mutable std::unordered_set<void *> scratch_buffers;
 #endif
@@ -884,7 +896,12 @@
 struct SyclDevice : public SyclDeviceBase {
   explicit SyclDevice(const QueueInterface *queue_stream)
       : SyclDeviceBase(queue_stream) {}
-
+ 
+  template <typename scalar_t>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, scalar_t>
+  get_null_accessor() const {
+    return queue_stream()->template get_null_accessor<scalar_t>();
+  }
   // this is the accessor used to construct the evaluator
   template <cl::sycl::access::mode AcMd, typename T>
   EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 2bd94c3..f8e3f29 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -65,7 +65,8 @@
       TensorBlock;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  TensorEvaluator(const Derived& m, const Device& device)
       : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
         m_dims(m.dimensions()),
         m_device(device)
@@ -263,7 +264,8 @@
       TensorBlock;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  TensorEvaluator(const Derived& m, const Device& device)
       : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
   { }
 
@@ -358,6 +360,7 @@
 {
   typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
       : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
@@ -455,6 +458,7 @@
     RawAccess          = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_device(device),
       m_functor(op.functor()),
@@ -571,6 +575,7 @@
     RawAccess         = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_device(device),
       m_functor(op.functor()),
@@ -709,6 +714,7 @@
     RawAccess         = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_functor(op.functor()),
       m_arg1Impl(op.arg1Expression(), device),
@@ -829,6 +835,7 @@
     RawAccess         = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_condImpl(op.ifExpression(), device),
       m_thenImpl(op.thenExpression(), device),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f961b40..92d04f6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -94,9 +94,8 @@
                 "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                 "EIGEN_USE_SYCL before including Eigen headers.");
 
-  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const Device& device = Device()) {
+                                      const Device& device = DefaultDevice()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
@@ -126,7 +125,6 @@
  public:
   typedef typename Expression::Index StorageIndex;
 
-  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(
       const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index ae03ba5..fdb4733 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -541,6 +541,24 @@
 template<typename ArgType, typename Device>
 struct TensorReductionEvaluatorBase;
 
+namespace internal {
+namespace reduction {
+
+template <typename CoeffReturnType, typename Device>
+EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
+    typename std::enable_if<Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type& dev) {
+    return (dev.template get_null_accessor<CoeffReturnType>());
+}
+
+template <typename CoeffReturnType, typename Device>
+EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
+    typename std::enable_if<!Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type&) {
+    return NULL;
+}
+
+}// end namespace reduction
+} // end namespace internal
+
 // Eval as rvalue
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
 struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
@@ -603,8 +621,10 @@
   static constexpr bool RunningFullReduction = (NumOutputDims==0);
 
   EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
-  {
+      : m_impl(op.expression(), device),
+        m_reducer(op.reducer()),
+        m_result(internal::reduction::get_null_value<CoeffReturnType, Device>(device)),
+        m_device(device) {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -895,7 +915,7 @@
   // binding placeholder accessors to a command group handler for SYCL
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
     m_impl.bind(cgh);
-    if(m_result) m_result.bind(cgh);
+    m_result.bind(cgh);
   }
 #endif
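The internal::reduction::get_null_value helpers above select their overload through enable_if on the device type: the SYCL build gets a typed placeholder accessor for m_result, and every other device keeps the old NULL initializer. A simplified, self-contained sketch of that dispatch (stand-in types, not Eigen's actual ones):

#include <type_traits>

struct SyclDeviceTag {};
struct DefaultDeviceTag {};

template <typename T, typename Device>
typename std::enable_if<std::is_same<Device, SyclDeviceTag>::value, T*>::type
get_null_value(const Device& /*dev*/) {
  static T dummy{};        // stand-in for dev.get_null_accessor<T>()
  return &dummy;
}

template <typename T, typename Device>
typename std::enable_if<!std::is_same<Device, SyclDeviceTag>::value, T*>::type
get_null_value(const Device&) {
  return nullptr;          // non-SYCL devices keep the old NULL behaviour
}

int main() {
  float* a = get_null_value<float>(SyclDeviceTag{});     // non-null placeholder
  float* b = get_null_value<float>(DefaultDeviceTag{});  // nullptr
  return (a != nullptr && b == nullptr) ? 0 : 1;
}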
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 51cdf44..600c2b0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -52,7 +52,7 @@
       return;
     }
     unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
       oldval = readback;
       newval = oldval;
       reducer.reduce(accum, reinterpret_cast<T*>(&newval));
@@ -65,6 +65,9 @@
     gpu_assert(0 && "Wordsize not supported");
   }
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
+  EIGEN_UNUSED_VARIABLE(reducer);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -118,6 +121,8 @@
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   atomicAdd(output, accum);
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -209,6 +214,11 @@
 #endif
   }
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(semaphore);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -243,7 +253,7 @@
 
 template <typename Self,
           typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -715,11 +725,11 @@
         half2* hr2 = reinterpret_cast<half2*>(&r2);
         half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
         half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] =
-              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
-          hr2[i] =
-              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+        for (int j = 0; j < packet_width / 2; j++) {
+          hr1[j] =
+              __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
+          hr2[j] =
+              __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
         }
         reducer.reducePacket(r1, &reduced_val1);
         reducer.reducePacket(r2, &reduced_val2);
@@ -744,7 +754,7 @@
       val = __halves2half2(val1, val2);
       if ((threadIdx.x & (warpSize - 1)) == 0) {
         half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
+        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
       }
     }
   }
@@ -782,12 +792,12 @@
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
+      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
       LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_blocks2, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
 
@@ -950,12 +960,12 @@
     if (num_blocks > 1) {
       // We initialize the outputs in the reduction kernel itself when we don't have to worry
       // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
+      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
       LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_blocks2, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index ed0a731..2c574c7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -191,7 +191,7 @@
               (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
                internal::reducer_traits<Reducer, Device>::PacketAccess)>
 struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) const {
     Index total_size = internal::array_prod(self.dimensions());
 
     // We fix the index along the scan axis to 0 and perform a
diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
index f8f1c5d..3e4c74b 100644
--- a/unsupported/Eigen/EulerAngles
+++ b/unsupported/Eigen/EulerAngles
@@ -35,8 +35,10 @@
 
 }
 
+// IWYU pragma: begin_exports
 #include "src/EulerAngles/EulerSystem.h"
 #include "src/EulerAngles/EulerAngles.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index b929e84..137b962 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -79,6 +79,8 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
+
 #ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #  include <fftw3.h>
@@ -110,6 +112,8 @@
   }
 #endif
 
+// IWYU pragma: end_exports
+
 namespace Eigen {
 
  
diff --git a/unsupported/Eigen/IterativeSolvers b/unsupported/Eigen/IterativeSolvers
index a22d2a3..5800e09 100644
--- a/unsupported/Eigen/IterativeSolvers
+++ b/unsupported/Eigen/IterativeSolvers
@@ -19,7 +19,6 @@
   * \defgroup IterativeLinearSolvers_Module Iterative Solvers module
   * This module aims to provide various iterative linear and non linear solver algorithms.
   * It currently provides:
-  *  - a constrained conjugate gradient
   *  - a Householder GMRES implementation
   *  - an IDR(s) implementation
   *  - a BiCGSTAB(L) implementation
@@ -79,6 +78,7 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #include "src/IterativeSolvers/IncompleteLU.h"
 #include "src/IterativeSolvers/GMRES.h"
 #include "src/IterativeSolvers/DGMRES.h"
@@ -86,6 +86,7 @@
 #include "src/IterativeSolvers/IDRS.h"
 #include "src/IterativeSolvers/BiCGSTABL.h"
 #include "src/IterativeSolvers/IDRSTABL.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index 9643ae2..0dcb995 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct
@@ -26,7 +26,9 @@
 
 } // namespace Eigen
 
+// IWYU pragma: begin_exports
 #include "src/KroneckerProduct/KroneckerTensorProduct.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/LevenbergMarquardt b/unsupported/Eigen/LevenbergMarquardt
index b5ace56..96d1e67 100644
--- a/unsupported/Eigen/LevenbergMarquardt
+++ b/unsupported/Eigen/LevenbergMarquardt
@@ -33,16 +33,16 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-
 #include "src/LevenbergMarquardt/LMqrsolv.h"
 #include "src/LevenbergMarquardt/LMcovar.h"
 #include "src/LevenbergMarquardt/LMpar.h"
-
 #endif
 
 #include "src/LevenbergMarquardt/LevenbergMarquardt.h"
 #include "src/LevenbergMarquardt/LMonestep.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/MatrixFunctions b/unsupported/Eigen/MatrixFunctions
index dddedb4..f46df5d 100644
--- a/unsupported/Eigen/MatrixFunctions
+++ b/unsupported/Eigen/MatrixFunctions
@@ -55,11 +55,13 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #include "src/MatrixFunctions/MatrixExponential.h"
 #include "src/MatrixFunctions/MatrixFunction.h"
 #include "src/MatrixFunctions/MatrixSquareRoot.h"
 #include "src/MatrixFunctions/MatrixLogarithm.h"
 #include "src/MatrixFunctions/MatrixPower.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/MoreVectorization b/unsupported/Eigen/MoreVectorization
index 7662b47..ea86fac 100644
--- a/unsupported/Eigen/MoreVectorization
+++ b/unsupported/Eigen/MoreVectorization
@@ -19,6 +19,8 @@
 
 }
 
+// IWYU pragma: begin_exports
 #include "src/MoreVectorization/MathFunctions.h"
+// IWYU pragma: end_exports
 
 #endif // EIGEN_MOREVECTORIZATION_MODULE_H
diff --git a/unsupported/Eigen/NonLinearOptimization b/unsupported/Eigen/NonLinearOptimization
index 6bf566e..929c503 100644
--- a/unsupported/Eigen/NonLinearOptimization
+++ b/unsupported/Eigen/NonLinearOptimization
@@ -118,6 +118,7 @@
   * unsupported/test/NonLinearOptimization.cpp.
   */
 
+// IWYU pragma: begin_exports
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
 #include "src/NonLinearOptimization/qrsolv.h"
@@ -135,6 +136,6 @@
 
 #include "src/NonLinearOptimization/HybridNonLinearSolver.h"
 #include "src/NonLinearOptimization/LevenbergMarquardt.h"
-
+// IWYU pragma: end_exports
 
 #endif // EIGEN_NONLINEAROPTIMIZATION_MODULE_H
diff --git a/unsupported/Eigen/NumericalDiff b/unsupported/Eigen/NumericalDiff
index 9d6270a..542e51a 100644
--- a/unsupported/Eigen/NumericalDiff
+++ b/unsupported/Eigen/NumericalDiff
@@ -48,7 +48,9 @@
 
 //@{
 
+// IWYU pragma: begin_exports
 #include "src/NumericalDiff/NumericalDiff.h"
+// IWYU pragma: end_exports
 
 //@}
 
diff --git a/unsupported/Eigen/Polynomials b/unsupported/Eigen/Polynomials
index 32ce2a2..0df6fa1 100644
--- a/unsupported/Eigen/Polynomials
+++ b/unsupported/Eigen/Polynomials
@@ -35,9 +35,11 @@
 	* at the start of your source file.
   */
 
+// IWYU pragma: begin_exports
 #include "src/Polynomials/PolynomialUtils.h"
 #include "src/Polynomials/Companion.h"
 #include "src/Polynomials/PolynomialSolver.h"
+// IWYU pragma: end_exports
 
 /**
 	\page polynomials Polynomials defines functions for dealing with polynomials
diff --git a/unsupported/Eigen/Skyline b/unsupported/Eigen/Skyline
index ebdf143..7a9f0ca 100644
--- a/unsupported/Eigen/Skyline
+++ b/unsupported/Eigen/Skyline
@@ -27,12 +27,14 @@
  *
  */
 
+// IWYU pragma: begin_exports
 #include "src/Skyline/SkylineUtil.h"
 #include "src/Skyline/SkylineMatrixBase.h"
 #include "src/Skyline/SkylineStorage.h"
 #include "src/Skyline/SkylineMatrix.h"
 #include "src/Skyline/SkylineInplaceLU.h"
 #include "src/Skyline/SkylineProduct.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/SparseExtra b/unsupported/Eigen/SparseExtra
index f4920de..88a639e 100644
--- a/unsupported/Eigen/SparseExtra
+++ b/unsupported/Eigen/SparseExtra
@@ -41,7 +41,7 @@
   * \endcode
   */
 
-
+// IWYU pragma: begin_exports
 #include "src/SparseExtra/RandomSetter.h"
 #include "src/SparseExtra/SparseInverse.h"
 
@@ -51,6 +51,7 @@
 #include <dirent.h>
 #include "src/SparseExtra/MatrixMarketIterator.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
index 41a3631..fb03db7 100644
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions
@@ -60,6 +60,7 @@
 
 }
 
+// IWYU pragma: begin_exports
 #include "src/SpecialFunctions/BesselFunctionsImpl.h"
 #include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
 #include "src/SpecialFunctions/BesselFunctionsHalf.h"
@@ -92,6 +93,7 @@
 #if defined EIGEN_VECTORIZE_GPU
   #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
 #endif
+// IWYU pragma: end_exports
 
 namespace Eigen {
 //@}
diff --git a/unsupported/Eigen/Splines b/unsupported/Eigen/Splines
index 2ca5813..b4fc81e 100644
--- a/unsupported/Eigen/Splines
+++ b/unsupported/Eigen/Splines
@@ -26,9 +26,11 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #include "src/Splines/SplineFwd.h"
 #include "src/Splines/Spline.h"
 #include "src/Splines/SplineFitting.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index 4eb8651..aca717b 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -283,7 +283,7 @@
 #endif
   int degree = 3;
   for (; degree <= maxPadeDegree; ++degree)
-    if (normIminusT <= maxNormForPade[degree - 3])
+    if (normIminusT <= static_cast<long double>(maxNormForPade[degree - 3]))
       break;
   return degree;
 }
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
index 909b08e..7dd3c3e 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -4,9 +4,6 @@
 namespace Eigen {
 namespace internal {
 
-// Bessel functions only available for some compilers.
-#if EIGEN_HAS_AVX512_MATH
-
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
 
@@ -43,8 +40,6 @@
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
 
-#endif
-
 }  // namespace internal
 }  // namespace Eigen
 
diff --git a/unsupported/test/NNLS.cpp b/unsupported/test/NNLS.cpp
index d65920b..b347c06 100644
--- a/unsupported/test/NNLS.cpp
+++ b/unsupported/test/NNLS.cpp
@@ -12,10 +12,11 @@
 #include "main.h"
 #include <unsupported/Eigen/NNLS>
 
+
 /// Check that 'x' solves the NNLS optimization problem `min ||A*x-b|| s.t. 0 <= x`.
 /// The \p tolerance parameter is the absolute tolerance on the gradient, A'*(A*x-b).
 template <typename MatrixType, typename VectorB, typename VectorX, typename Scalar>
-static void verify_nnls_optimality(const MatrixType &A, const VectorB &b, const VectorX &x, const Scalar tolerance) {
+void verify_nnls_optimality(const MatrixType &A, const VectorB &b, const VectorX &x, const Scalar tolerance) {
   // The NNLS optimality conditions are:
   //
   // * 0 = A'*A*x - A'*b - lambda
@@ -38,7 +39,7 @@
 }
 
 template <typename MatrixType, typename VectorB, typename VectorX>
-static void test_nnls_known_solution(const MatrixType &A, const VectorB &b, const VectorX &x_expected) {
+void test_nnls_known_solution(const MatrixType &A, const VectorB &b, const VectorX &x_expected) {
   using Scalar = typename MatrixType::Scalar;
 
   using std::sqrt;
@@ -53,7 +54,7 @@
 }
 
 template <typename MatrixType>
-static void test_nnls_random_problem() {
+void test_nnls_random_problem() {
   //
   // SETUP
   //
@@ -101,7 +102,7 @@
   verify_nnls_optimality(A, b, x, tolerance);
 }
 
-static void test_nnls_handles_zero_rhs() {
+void test_nnls_handles_zero_rhs() {
   //
   // SETUP
   //
@@ -124,7 +125,7 @@
   VERIFY_IS_EQUAL(x, VectorXd::Zero(cols));
 }
 
-static void test_nnls_handles_Mx0_matrix() {
+void test_nnls_handles_Mx0_matrix() {
   //
   // SETUP
   //
@@ -146,7 +147,7 @@
   VERIFY_IS_EQUAL(x.size(), 0);
 }
 
-static void test_nnls_handles_0x0_matrix() {
+void test_nnls_handles_0x0_matrix() {
   //
   // SETUP
   //
@@ -167,7 +168,7 @@
   VERIFY_IS_EQUAL(x.size(), 0);
 }
 
-static void test_nnls_handles_dependent_columns() {
+void test_nnls_handles_dependent_columns() {
   //
   // SETUP
   //
@@ -197,7 +198,7 @@
   }
 }
 
-static void test_nnls_handles_wide_matrix() {
+void test_nnls_handles_wide_matrix() {
   //
   // SETUP
   //
@@ -230,7 +231,7 @@
 }
 
 // 4x2 problem, unconstrained solution positive
-static void test_nnls_known_1() {
+void test_nnls_known_1() {
   Matrix<double, 4, 2> A(4, 2);
   Matrix<double, 4, 1> b(4);
   Matrix<double, 2, 1> x(2);
@@ -242,7 +243,7 @@
 }
 
 // 4x3 problem, unconstrained solution positive
-static void test_nnls_known_2() {
+void test_nnls_known_2() {
   Matrix<double, 4, 3> A(4, 3);
   Matrix<double, 4, 1> b(4);
   Matrix<double, 3, 1> x(3);
@@ -255,7 +256,7 @@
 }
 
 // Simple 4x4 problem, unconstrained solution non-negative
-static void test_nnls_known_3() {
+void test_nnls_known_3() {
   Matrix<double, 4, 4> A(4, 4);
   Matrix<double, 4, 1> b(4);
   Matrix<double, 4, 1> x(4);
@@ -268,7 +269,7 @@
 }
 
 // Simple 4x3 problem, unconstrained solution non-negative
-static void test_nnls_known_4() {
+void test_nnls_known_4() {
   Matrix<double, 4, 3> A(4, 3);
   Matrix<double, 4, 1> b(4);
   Matrix<double, 3, 1> x(3);
@@ -281,7 +282,7 @@
 }
 
 // Simple 4x3 problem, unconstrained solution indefinite
-static void test_nnls_known_5() {
+void test_nnls_known_5() {
   Matrix<double, 4, 3> A(4, 3);
   Matrix<double, 4, 1> b(4);
   Matrix<double, 3, 1> x(3);
@@ -294,7 +295,7 @@
   test_nnls_known_solution(A, b, x);
 }
 
-static void test_nnls_small_reference_problems() {
+void test_nnls_small_reference_problems() {
   test_nnls_known_1();
   test_nnls_known_2();
   test_nnls_known_3();
@@ -302,7 +303,7 @@
   test_nnls_known_5();
 }
 
-static void test_nnls_with_half_precision() {
+void test_nnls_with_half_precision() {
   // The random matrix generation tools don't work with `half`,
   // so here's a simpler setup mostly just to check that NNLS compiles & runs with custom scalar types.
 
@@ -319,7 +320,7 @@
   verify_nnls_optimality(A, b, x, half(1e-1));
 }
 
-static void test_nnls_special_case_solves_in_zero_iterations() {
+void test_nnls_special_case_solves_in_zero_iterations() {
   // The particular NNLS algorithm that is implemented starts with all variables
   // in the active set.
   // This test builds a system where all constraints are active at the solution,
@@ -346,7 +347,7 @@
   VERIFY(nnls.iterations() == 0);
 }
 
-static void test_nnls_special_case_solves_in_n_iterations() {
+void test_nnls_special_case_solves_in_n_iterations() {
   // The particular NNLS algorithm that is implemented starts with all variables
   // in the active set and then adds one variable to the inactive set each iteration.
   // This test builds a system where all variables are inactive at the solution,
@@ -370,7 +371,7 @@
   VERIFY(nnls.iterations() == n);
 }
 
-static void test_nnls_returns_NoConvergence_when_maxIterations_is_too_low() {
+void test_nnls_returns_NoConvergence_when_maxIterations_is_too_low() {
   // Using the special case that takes `n` iterations,
   // from `test_nnls_special_case_solves_in_n_iterations`,
   // we can set max iterations too low and that should cause the solve to fail.
@@ -391,7 +392,7 @@
   VERIFY(nnls.iterations() == max_iters);
 }
 
-static void test_nnls_default_maxIterations_is_twice_column_count() {
+void test_nnls_default_maxIterations_is_twice_column_count() {
   const Index cols = internal::random<Index>(1, EIGEN_TEST_MAX_SIZE);
   const Index rows = internal::random<Index>(cols, EIGEN_TEST_MAX_SIZE);
   const MatrixXd A = MatrixXd::Random(rows, cols);
@@ -401,7 +402,7 @@
   VERIFY_IS_EQUAL(nnls.maxIterations(), 2 * cols);
 }
 
-static void test_nnls_does_not_allocate_during_solve() {
+void test_nnls_does_not_allocate_during_solve() {
   const Index cols = internal::random<Index>(1, EIGEN_TEST_MAX_SIZE);
   const Index rows = internal::random<Index>(cols, EIGEN_TEST_MAX_SIZE);
   const MatrixXd A = MatrixXd::Random(rows, cols);
@@ -414,7 +415,7 @@
   internal::set_is_malloc_allowed(true);
 }
 
-static void test_nnls_repeated_calls_to_compute_and_solve() {
+void test_nnls_repeated_calls_to_compute_and_solve() {
   const Index cols2 = internal::random<Index>(1, EIGEN_TEST_MAX_SIZE);
   const Index rows2 = internal::random<Index>(cols2, EIGEN_TEST_MAX_SIZE);
   const MatrixXd A2 = MatrixXd::Random(rows2, cols2);
@@ -449,8 +450,10 @@
     // Essential NNLS properties, across different types.
     CALL_SUBTEST_2(test_nnls_random_problem<MatrixXf>());
     CALL_SUBTEST_3(test_nnls_random_problem<MatrixXd>());
-    using MatFixed = Matrix<double, 12, 5>;
-    CALL_SUBTEST_4(test_nnls_random_problem<MatFixed>());
+    {
+      using MatFixed = Matrix<double, 12, 5>;
+      CALL_SUBTEST_4(test_nnls_random_problem<MatFixed>());
+    }
     CALL_SUBTEST_5(test_nnls_with_half_precision());
 
     // Robustness tests:
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 74e9026..7ba104b 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -26,8 +26,10 @@
 #ifdef SYCL_COMPILER_IS_DPCPP
 template <typename T>
 struct cl::sycl::is_device_copyable<
-    const OffByOneScalar<T>,
-    std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
+    OffByOneScalar<T>,
+    std::enable_if_t<!(!std::is_trivially_copyable<OffByOneScalar<T>>::value &&
+                       (std::is_const_v<OffByOneScalar<T>> || std::is_volatile_v<OffByOneScalar<T>>))>>
+    : std::true_type {};
 #endif
 
 template <typename DataType, int DataLayout, typename IndexType>
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index a090e4a..ce4e538 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -485,7 +485,7 @@
     // Test against probabilistic forward error bound. In reality, the error is much smaller
     // when we use tree summation.
     double err = Eigen::numext::abs(static_cast<double>(sum()) - expected_sum);
-    double tol = numext::sqrt(num_elements) * NumTraits<ScalarType>::epsilon() * static_cast<ScalarType>(abs_sum);
+    double tol = numext::sqrt(static_cast<double>(num_elements)) * NumTraits<ScalarType>::epsilon() * static_cast<ScalarType>(abs_sum);
     VERIFY_LE(err, tol);
   }
 }
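The cast above matters because num_elements is an integer index type, so without it numext::sqrt would be applied to an integer (or trigger an implicit-conversion warning). The quantity being checked is the usual probabilistic forward error estimate, roughly sqrt(N) * eps * sum|x_i|. A minimal sketch of the tolerance computation (illustrative helper, not from the test; ScalarType is assumed to be float):

#include <Eigen/Core>
#include <cmath>

double reduction_tolerance(Eigen::Index num_elements, double abs_sum) {
  const double eps = Eigen::NumTraits<float>::epsilon();
  return std::sqrt(static_cast<double>(num_elements)) * eps * abs_sum;
}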