Update Eigen to commit:3cf6bb6f1ce9cd3cd6bdc43cdcfae7f4822ff2c3

CHANGELOG
=========
3cf6bb6f1 - Fix a bug in commit 76e8c0455396446f8166c798da5efe879e010bdc:
32165c6f0 - Fix Wshorten-64-to-32 warning in gemm parallelizer
b33dbb576 - Fix implicit narrowing warning in Parallelizer.h.
3a9635b20 - Link pthread for product_threaded test
f78c37f0a - traits<Ref>::match: use correct strides
516d08a49 - Fix typo in Parallelizer.h
76e8c0455 - Generalize parallel GEMM implementation in Core to work with ThreadPool in addition to OpenMP.
4d54c43d6 - Fix typo to allow nomalloc test to pass on AVX512.
a25f02d73 - Fix int overflow causing cxx11_tensor_gpu_1 to fail.
6f9ad7da6 - fix Wshorten-64-to-32 warnings in div_ceil
1aac9332c - TensorReduction: replace divup with div_ceil
5de0f2f89 - Fixes #2735: Component-wise cbrt
48b254a4b - Disable denorm deprecation warnings in MSVC C++23.
176821f2f - Avoid building docs if cross-compiling or not top level.

PiperOrigin-RevId: 582839371
Change-Id: I1a6247e188e09c8f9c56c5bbda59d4714573f5fd
diff --git a/Eigen/Core b/Eigen/Core
index 93d4789..700d815 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -349,6 +349,9 @@
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
+#ifdef EIGEN_GEMM_THREADPOOL
+#include "ThreadPool"
+#endif
 #include "src/Core/products/Parallelizer.h"
 #include "src/Core/ProductEvaluators.h"
 #include "src/Core/products/GeneralMatrixVector.h"
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 0fda71c..448dae2 100644
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -140,6 +140,7 @@
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round,  _)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(cbrt,  Cbrt,  _)
 
 #define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
   template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8e9902e..d04f713 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -1017,6 +1017,10 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { return numext::sqrt(a); }
 
+/** \internal \returns the cube-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcbrt(const Packet& a) { return numext::cbrt(a); }
+
 /** \internal \returns the rounded value of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pround(const Packet& a) { using numext::round; return round(a); }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 9818b86..2d8bb80 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -89,6 +89,7 @@
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(carg, scalar_carg_op, complex argument, \sa ArrayBase::carg DOXCOMMA MatrixBase::cwiseCArg)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cbrt,scalar_cbrt_op,cube root,\sa ArrayBase::cbrt DOXCOMMA MatrixBase::cwiseCbrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 0f5a0fd..6f2fd6d 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1394,6 +1394,14 @@
 SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
 #endif
 
+/** \returns the cube root of \a x. **/
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cbrt(const T &x) {
+  EIGEN_USING_STD(cbrt);
+  return static_cast<T>(cbrt(x));
+}
+
 /** \returns the reciprocal square root of \a x. **/
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index df43c05..5be39ce 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -26,7 +26,9 @@
   enum {
     Options = Options_,
     Flags = traits<Map<PlainObjectType_, Options_, StrideType_> >::Flags | NestByRefBit,
-    Alignment = traits<Map<PlainObjectType_, Options_, StrideType_> >::Alignment
+    Alignment = traits<Map<PlainObjectType_, Options_, StrideType_> >::Alignment,
+    InnerStrideAtCompileTime = traits<Map<PlainObjectType_, Options_, StrideType_> >::InnerStrideAtCompileTime,
+    OuterStrideAtCompileTime = traits<Map<PlainObjectType_, Options_, StrideType_> >::OuterStrideAtCompileTime
   };
 
   template<typename Derived> struct match {
@@ -34,11 +36,11 @@
       IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
       HasDirectAccess = internal::has_direct_access<Derived>::ret,
       StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
-      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
-                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
-                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
+      InnerStrideMatch = int(InnerStrideAtCompileTime)==int(Dynamic)
+                      || int(InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
+                      || (int(InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
       OuterStrideMatch = IsVectorAtCompileTime
-                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
+                      || int(OuterStrideAtCompileTime)==int(Dynamic) || int(OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
       // NOTE, this indirection of evaluator<Derived>::Alignment is needed
       // to workaround a very strange bug in MSVC related to the instantiation
       // of has_*ary_operator in evaluator<CwiseNullaryOp>.
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index faa3853..fc11174 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@
-// This file is part of Eigen, a lightweight C++ template library

-// for linear algebra.

-//

-//

-//

-// This Source Code Form is subject to the terms of the Mozilla

-// Public License v. 2.0. If a copy of the MPL was not distributed

-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-

-#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H

-#define EIGEN_PACKET_MATH_FP16_AVX512_H

-

+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
 // IWYU pragma: private
-#include "../../InternalHeaderCheck.h"

-

-namespace Eigen {

-

-namespace internal {

-

-typedef __m512h Packet32h;

-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;

-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;

-

-template <>

-struct is_arithmetic<Packet8h> {

-  enum { value = true };

-};

-

-template <>

-struct packet_traits<half> : default_packet_traits {

-  typedef Packet32h type;

-  typedef Packet16h half;

-  enum {

-    Vectorizable = 1,

-    AlignedOnScalar = 1,

-    size = 32,

-

-    HasCmp = 1,

-    HasAdd = 1,

-    HasSub = 1,

-    HasMul = 1,

-    HasDiv = 1,

-    HasNegate = 1,

-    HasAbs = 1,

-    HasAbs2 = 0,

-    HasMin = 1,

-    HasMax = 1,

-    HasConj = 1,

-    HasSetLinear = 0,

-    HasLog = 1,

-    HasLog1p = 1,

-    HasExp = 1,

-    HasExpm1 = 1,

-    HasSqrt = 1,

-    HasRsqrt = 1,

-    // These ones should be implemented in future

-    HasBessel = 0,

-    HasNdtri = 0,

-    HasSin = EIGEN_FAST_MATH,

-    HasCos = EIGEN_FAST_MATH,

-    HasTanh = EIGEN_FAST_MATH,

-    HasErf = 0,  // EIGEN_FAST_MATH,

-    HasBlend = 0,

-    HasRound = 1,

-    HasFloor = 1,

-    HasCeil = 1,

-    HasRint = 1

-  };

-};

-

-template <>

-struct unpacket_traits<Packet32h> {

-  typedef Eigen::half type;

-  typedef Packet16h half;

-  enum {

-    size = 32,

-    alignment = Aligned64,

-    vectorizable = true,

-    masked_load_available = false,

-    masked_store_available = false

-  };

-};

-

-template <>

-struct unpacket_traits<Packet16h> {

-  typedef Eigen::half type;

-  typedef Packet8h half;

-  enum {

-    size = 16,

-    alignment = Aligned32,

-    vectorizable = true,

-    masked_load_available = false,

-    masked_store_available = false

-  };

-};

-

-template <>

-struct unpacket_traits<Packet8h> {

-  typedef Eigen::half type;

-  typedef Packet8h half;

-  enum {

-    size = 8,

-    alignment = Aligned16,

-    vectorizable = true,

-    masked_load_available = false,

-    masked_store_available = false

-  };

-};

-

-// Memory functions

-

-// pset1

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {

-  return _mm512_set1_ph(static_cast<_Float16>(from));

-}

-

-// pset1frombits

-template <>

-EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {

-  return _mm512_castsi512_ph(_mm512_set1_epi16(from));

-}

-

-// pfirst

-

-template <>

-EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {

-#ifdef EIGEN_VECTORIZE_AVX512DQ

-  return half_impl::raw_uint16_to_half(

-      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));

-#else

-  Eigen::half dest[32];

-  _mm512_storeu_ph(dest, from);

-  return dest[0];

-#endif

-}

-

-// pload

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {

-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);

-}

-

-// ploadu

-

-template <>

-EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {

-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);

-}

-

-// pstore

-

-template <>

-EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {

-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);

-}

-

-// pstoreu

-

-template <>

-EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {

-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);

-}

-

-// ploaddup

-template <>

-EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {

-  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));

-  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,

-                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),

-                               a);

-}

-

-// ploadquad

-template <>

-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {

-  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));

-  return _mm512_permutexvar_ph(

-      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),

-      a);

-}

-

-// pabs

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {

-  return _mm512_abs_ph(a);

-}

-

-// psignbit

-

-template <>

-EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {

-  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));

-}

-

-// pmin

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {

-  return _mm512_min_ph(a, b);

-}

-

-// pmax

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {

-  return _mm512_max_ph(a, b);

-}

-

-// plset

-template <>

-EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {

-  return _mm512_add_ph(_mm512_set1_ph(a),

-                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,

-                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,

-                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));

-}

-

-// por

-

-template <>

-EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {

-  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));

-}

-

-// pxor

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {

-  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));

-}

-

-// pand

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {

-  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));

-}

-

-// pandnot

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {

-  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));

-}

-

-// pselect

-

-template <>

-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {

-  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);

-  return _mm512_mask_blend_ph(mask32, a, b);

-}

-

-// pcmp_eq

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {

-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);

-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));

-}

-

-// pcmp_le

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {

-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);

-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));

-}

-

-// pcmp_lt

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {

-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);

-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));

-}

-

-// pcmp_lt_or_nan

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {

-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);

-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));

-}

-

-// padd

-

-template <>

-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {

-  return _mm512_add_ph(a, b);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {

-  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {

-  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

-}

-

-// psub

-

-template <>

-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {

-  return _mm512_sub_ph(a, b);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {

-  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {

-  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

-}

-

-// pmul

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {

-  return _mm512_mul_ph(a, b);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {

-  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {

-  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

-}

-

-// pdiv

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {

-  return _mm512_div_ph(a, b);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {

-  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {

-  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

-}

-

-// pround

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {

-  // Work-around for default std::round rounding mode.

-

-  // Mask for the sign bit

-  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));

-  // The largest half-preicision float less than 0.5

-  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));

-

-  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);

-}

-

-// print

-

-template <>

-EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {

-  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);

-}

-

-// pceil

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {

-  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);

-}

-

-// pfloor

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {

-  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);

-}

-

-// predux

-template <>

-EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {

-  return (half)_mm512_reduce_add_ph(a);

-}

-

-template <>

-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {

-  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));

-}

-

-template <>

-EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {

-  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));

-}

-

-// predux_half_dowto4

-template <>

-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {

-#ifdef EIGEN_VECTORIZE_AVX512DQ

-  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));

-  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));

-

-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));

-#else

-  Eigen::half data[32];

-  _mm512_storeu_ph(data, a);

-

-  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));

-  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));

-

-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));

-#endif

-}

-

-// predux_max

-

-// predux_min

-

-// predux_mul

-

-#ifdef EIGEN_VECTORIZE_FMA

-

-// pmadd

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

-  return _mm512_fmadd_ph(a, b, c);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

-  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

-  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

-}

-

-// pmsub

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

-  return _mm512_fmsub_ph(a, b, c);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

-  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

-  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

-}

-

-// pnmadd

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

-  return _mm512_fnmadd_ph(a, b, c);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

-  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

-  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

-}

-

-// pnmsub

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

-  return _mm512_fnmsub_ph(a, b, c);

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

-  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

-  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

-}

-

-#endif

-

-// pnegate

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {

-  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);

-}

-

-// pconj

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {

-  return a;

-}

-

-// psqrt

-

-template <>

-EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {

-  return _mm512_sqrt_ph(a);

-}

-

-// prsqrt

-

-template <>

-EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {

-  return _mm512_rsqrt_ph(a);

-}

-

-// preciprocal

-

-template <>

-EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {

-  return _mm512_rcp_ph(a);

-}

-

-// ptranspose

-

-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {

-  __m512i t[32];

-

-  EIGEN_UNROLL_LOOP

-  for (int i = 0; i < 16; i++) {

-    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));

-    t[2 * i + 1] =

-        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));

-  }

-

-  __m512i p[32];

-

-  EIGEN_UNROLL_LOOP

-  for (int i = 0; i < 8; i++) {

-    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);

-    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);

-    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);

-    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);

-  }

-

-  __m512i q[32];

-

-  EIGEN_UNROLL_LOOP

-  for (int i = 0; i < 4; i++) {

-    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);

-    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);

-    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);

-    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);

-    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);

-    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);

-    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);

-    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);

-  }

-

-  __m512i f[32];

-

-#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \

-  do {                                                                                              \

-    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \

-    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \

-    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \

-    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \

-    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \

-    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \

-    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \

-    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \

-  } while (false);

-

-  PACKET32H_TRANSPOSE_HELPER(0, 0);

-  PACKET32H_TRANSPOSE_HELPER(1, 1);

-  PACKET32H_TRANSPOSE_HELPER(2, 2);

-  PACKET32H_TRANSPOSE_HELPER(3, 3);

-

-  PACKET32H_TRANSPOSE_HELPER(1, 0);

-  PACKET32H_TRANSPOSE_HELPER(2, 0);

-  PACKET32H_TRANSPOSE_HELPER(3, 0);

-  PACKET32H_TRANSPOSE_HELPER(2, 1);

-  PACKET32H_TRANSPOSE_HELPER(3, 1);

-  PACKET32H_TRANSPOSE_HELPER(3, 2);

-

-  PACKET32H_TRANSPOSE_HELPER(0, 1);

-  PACKET32H_TRANSPOSE_HELPER(0, 2);

-  PACKET32H_TRANSPOSE_HELPER(0, 3);

-  PACKET32H_TRANSPOSE_HELPER(1, 2);

-  PACKET32H_TRANSPOSE_HELPER(1, 3);

-  PACKET32H_TRANSPOSE_HELPER(2, 3);

-

-#undef PACKET32H_TRANSPOSE_HELPER

-

-  EIGEN_UNROLL_LOOP

-  for (int i = 0; i < 32; i++) {

-    a.packet[i] = _mm512_castsi512_ph(f[i]);

-  }

-}

-

-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {

-  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;

-  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));

-  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));

-  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));

-  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));

-

-  p0 = _mm512_unpacklo_epi32(t0, t2);

-  p1 = _mm512_unpackhi_epi32(t0, t2);

-  p2 = _mm512_unpacklo_epi32(t1, t3);

-  p3 = _mm512_unpackhi_epi32(t1, t3);

-

-  a0 = p0;

-  a1 = p1;

-  a2 = p2;

-  a3 = p3;

-

-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);

-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);

-

-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);

-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);

-

-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);

-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);

-

-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);

-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);

-

-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);

-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);

-

-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);

-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);

-

-  a.packet[0] = _mm512_castsi512_ph(a0);

-  a.packet[1] = _mm512_castsi512_ph(a1);

-  a.packet[2] = _mm512_castsi512_ph(a2);

-  a.packet[3] = _mm512_castsi512_ph(a3);

-}

-

-// preverse

-

-template <>

-EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {

-  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

-                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),

-                               a);

-}

-

-// pscatter

-

-template <>

-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {

-  EIGEN_ALIGN64 half aux[32];

-  pstore(aux, from);

-

-  EIGEN_UNROLL_LOOP

-  for (int i = 0; i < 32; i++) {

-    to[stride * i] = aux[i];

-  }

-}

-

-// pgather

-

-template <>

-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {

-  return _mm512_castsi512_ph(_mm512_set_epi16(

-      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,

-      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,

-      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,

-      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,

-      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,

-      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,

-      from[1 * stride].x, from[0 * stride].x));

-}

-

-template <>

-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);

-template <>

-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);

-

-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {

-  __m512d result = _mm512_undefined_pd();

-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);

-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);

-  return _mm512_castpd_ph(result);

-}

-

-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {

-  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));

-  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));

-}

-

-// psin

-template <>

-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = psin(low);

-  Packet16h highOut = psin(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// pcos

-template <>

-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = pcos(low);

-  Packet16h highOut = pcos(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// plog

-template <>

-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = plog(low);

-  Packet16h highOut = plog(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// plog2

-template <>

-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = plog2(low);

-  Packet16h highOut = plog2(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// plog1p

-template <>

-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = plog1p(low);

-  Packet16h highOut = plog1p(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// pexp

-template <>

-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = pexp(low);

-  Packet16h highOut = pexp(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// pexpm1

-template <>

-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = pexpm1(low);

-  Packet16h highOut = pexpm1(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// ptanh

-template <>

-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h lowOut = ptanh(low);

-  Packet16h highOut = ptanh(high);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// pfrexp

-template <>

-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h exp1 = _mm256_undefined_si256();

-  Packet16h exp2 = _mm256_undefined_si256();

-

-  Packet16h lowOut = pfrexp(low, exp1);

-  Packet16h highOut = pfrexp(high, exp2);

-

-  exponent = combine2Packet16h(exp1, exp2);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-// pldexp

-template <>

-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {

-  Packet16h low;

-  Packet16h high;

-  extract2Packet16h(a, low, high);

-

-  Packet16h exp1;

-  Packet16h exp2;

-  extract2Packet16h(exponent, exp1, exp2);

-

-  Packet16h lowOut = pldexp(low, exp1);

-  Packet16h highOut = pldexp(high, exp2);

-

-  return combine2Packet16h(lowOut, highOut);

-}

-

-}  // end namespace internal

-}  // end namespace Eigen

-

-#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H

+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+typedef __m512h Packet32h;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+
+template <>
+struct is_arithmetic<Packet8h> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet32h type;
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 32,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    // These ones should be implemented in future
+    HasBessel = 0,
+    HasNdtri = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = 0,  // EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet32h> {
+  typedef Eigen::half type;
+  typedef Packet16h half;
+  enum {
+    size = 32,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// Memory functions
+
+// pset1
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
+  return _mm512_set1_ph(static_cast<_Float16>(from));
+}
+
+// pset1frombits
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
+  return _mm512_castsi512_ph(_mm512_set1_epi16(from));
+}
+
+// pfirst
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return half_impl::raw_uint16_to_half(
+      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
+#else
+  Eigen::half dest[32];
+  _mm512_storeu_ph(dest, from);
+  return dest[0];
+#endif
+}
+
+// pload
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
+}
+
+// ploadu
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
+}
+
+// pstore
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
+}
+
+// pstoreu
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
+}
+
+// ploaddup
+template <>
+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
+  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
+  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+                               a);
+}
+
+// ploadquad
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
+  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
+  return _mm512_permutexvar_ph(
+      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+      a);
+}
+
+// pabs
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
+  return _mm512_abs_ph(a);
+}
+
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
+// pmin
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_min_ph(a, b);
+}
+
+// pmax
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_max_ph(a, b);
+}
+
+// plset
+template <>
+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
+  return _mm512_add_ph(_mm512_set1_ph(a),
+                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
+                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
+                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+
+// por
+
+template <>
+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pxor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pand
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pandnot
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+}
+
+// pselect
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_ph(mask32, a, b);
+}
+
+// pcmp_eq
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_le
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt_or_nan
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+}
+
+// padd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// psub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pmul
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pdiv
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pround
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit
+  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+  // The largest half-preicision float less than 0.5
+  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+// print
+
+template <>
+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+// pceil
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+// pfloor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// predux
+template <>
+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
+  return (half)_mm512_reduce_add_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
+  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
+  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+}
+
+// predux_half_dowto4
+template <>
+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
+  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+
+  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#else
+  Eigen::half data[32];
+  _mm512_storeu_ph(data, a);
+
+  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
+  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
+
+  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#endif
+}
+
+// predux_max
+
+// predux_min
+
+// predux_mul
+
+#ifdef EIGEN_VECTORIZE_FMA
+
+// pmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+#endif
+
+// pnegate
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
+  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
+}
+
+// pconj
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
+  return a;
+}
+
+// psqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
+  return _mm512_sqrt_ph(a);
+}
+
+// prsqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
+  return _mm512_rsqrt_ph(a);
+}
+
+// preciprocal
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
+  return _mm512_rcp_ph(a);
+}
+
+// ptranspose
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
+  __m512i t[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 16; i++) {
+    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+    t[2 * i + 1] =
+        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+  }
+
+  __m512i p[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 8; i++) {
+    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
+    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
+    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
+    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
+  }
+
+  __m512i q[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 4; i++) {
+    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
+    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
+    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
+    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
+    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
+    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
+    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
+    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
+  }
+
+  __m512i f[32];
+
+#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
+  do {                                                                                              \
+    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
+    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
+    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
+    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
+    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
+    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
+    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
+    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+  } while (false);
+
+  PACKET32H_TRANSPOSE_HELPER(0, 0);
+  PACKET32H_TRANSPOSE_HELPER(1, 1);
+  PACKET32H_TRANSPOSE_HELPER(2, 2);
+  PACKET32H_TRANSPOSE_HELPER(3, 3);
+
+  PACKET32H_TRANSPOSE_HELPER(1, 0);
+  PACKET32H_TRANSPOSE_HELPER(2, 0);
+  PACKET32H_TRANSPOSE_HELPER(3, 0);
+  PACKET32H_TRANSPOSE_HELPER(2, 1);
+  PACKET32H_TRANSPOSE_HELPER(3, 1);
+  PACKET32H_TRANSPOSE_HELPER(3, 2);
+
+  PACKET32H_TRANSPOSE_HELPER(0, 1);
+  PACKET32H_TRANSPOSE_HELPER(0, 2);
+  PACKET32H_TRANSPOSE_HELPER(0, 3);
+  PACKET32H_TRANSPOSE_HELPER(1, 2);
+  PACKET32H_TRANSPOSE_HELPER(1, 3);
+  PACKET32H_TRANSPOSE_HELPER(2, 3);
+
+#undef PACKET32H_TRANSPOSE_HELPER
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 32; i++) {
+    a.packet[i] = _mm512_castsi512_ph(f[i]);
+  }
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
+  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
+  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+
+  p0 = _mm512_unpacklo_epi32(t0, t2);
+  p1 = _mm512_unpackhi_epi32(t0, t2);
+  p2 = _mm512_unpacklo_epi32(t1, t3);
+  p3 = _mm512_unpackhi_epi32(t1, t3);
+
+  a0 = p0;
+  a1 = p1;
+  a2 = p2;
+  a3 = p3;
+
+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
+
+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
+
+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
+
+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
+
+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
+
+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
+
+  a.packet[0] = _mm512_castsi512_ph(a0);
+  a.packet[1] = _mm512_castsi512_ph(a1);
+  a.packet[2] = _mm512_castsi512_ph(a2);
+  a.packet[3] = _mm512_castsi512_ph(a3);
+}
+
+// preverse
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
+  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+                               a);
+}
+
+// pscatter
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
+  EIGEN_ALIGN64 half aux[32];
+  pstore(aux, from);
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 32; i++) {
+    to[stride * i] = aux[i];
+  }
+}
+
+// pgather
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+  return _mm512_castsi512_ph(_mm512_set_epi16(
+      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
+      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
+      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
+      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
+      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
+      from[1 * stride].x, from[0 * stride].x));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+  __m512d result = _mm512_undefined_pd();
+  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
+  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
+  return _mm512_castpd_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
+  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
+}
+
+// psin
+template <>
+EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = psin(low);
+  Packet16h highOut = psin(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// pcos
+template <>
+EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = pcos(low);
+  Packet16h highOut = pcos(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// plog
+template <>
+EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = plog(low);
+  Packet16h highOut = plog(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// plog2
+template <>
+EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = plog2(low);
+  Packet16h highOut = plog2(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// plog1p
+template <>
+EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = plog1p(low);
+  Packet16h highOut = plog1p(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// pexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = pexp(low);
+  Packet16h highOut = pexp(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// pexpm1
+template <>
+EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = pexpm1(low);
+  Packet16h highOut = pexpm1(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// ptanh
+template <>
+EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h lowOut = ptanh(low);
+  Packet16h highOut = ptanh(high);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h exp1 = _mm256_undefined_si256();
+  Packet16h exp2 = _mm256_undefined_si256();
+
+  Packet16h lowOut = pfrexp(low, exp1);
+  Packet16h highOut = pfrexp(high, exp2);
+
+  exponent = combine2Packet16h(exp1, exp2);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+  Packet16h low;
+  Packet16h high;
+  extract2Packet16h(a, low, high);
+
+  Packet16h exp1;
+  Packet16h exp2;
+  extract2Packet16h(exponent, exp1, exp2);
+
+  Packet16h lowOut = pldexp(low, exp1);
+  Packet16h highOut = pldexp(high, exp2);
+
+  return combine2Packet16h(lowOut, highOut);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index f0b1797..a3025ec 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -1082,7 +1082,7 @@
     Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
           size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
@@ -1098,7 +1098,7 @@
     Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
           size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
@@ -1132,7 +1132,7 @@
     Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
           size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
@@ -1148,7 +1148,7 @@
     Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
           size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 89cd677..d988eb1d 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -472,6 +472,20 @@
 };
 
 /** \internal
+  * \brief Template functor to compute the cube root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sqrt()
+  */
+template <typename Scalar>
+struct scalar_cbrt_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_cbrt_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+/** \internal
   * \brief Template functor to compute the reciprocal square root of a scalar
   * \sa class CwiseUnaryOp, Cwise::rsqrt()
   */
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index e8bb821..3e7784d 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -84,12 +84,12 @@
   gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
   gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
-#ifdef EIGEN_HAS_OPENMP
+#if defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
   if(info)
   {
     // this is the parallel version!
-    int tid = omp_get_thread_num();
-    int threads = omp_get_num_threads();
+    int tid = info->logical_thread_id;
+    int threads = info->num_threads;
 
     LhsScalar* blockA = blocking.blockA();
     eigen_internal_assert(blockA!=0);
@@ -110,15 +110,15 @@
       // each thread packs the sub block A_k,i to A'_i where i is the thread id.
 
       // However, before copying to A'_i, we have to make sure that no other thread is still using it,
-      // i.e., we test that info[tid].users equals 0.
-      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
-      while(info[tid].users!=0) {}
-      info[tid].users = threads;
+      // i.e., we test that info->task_info[tid].users equals 0.
+      // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to use it.
+      while(info->task_info[tid].users!=0) {}
+      info->task_info[tid].users = threads;
 
-      pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
+      pack_lhs(blockA+info->task_info[tid].lhs_start*actual_kc, lhs.getSubMapper(info->task_info[tid].lhs_start,k), actual_kc, info->task_info[tid].lhs_length);
 
       // Notify the other threads that the part A'_i is ready to go.
-      info[tid].sync = k;
+      info->task_info[tid].sync = k;
 
       // Computes C_i += A' * B' per A'_i
       for(int shift=0; shift<threads; ++shift)
@@ -129,11 +129,10 @@
         // we use testAndSetOrdered to mimic a volatile access.
         // However, no need to wait for the B' part which has been updated by the current thread!
         if (shift>0) {
-          while(info[i].sync!=k) {
-          }
+          while(info->task_info[i].sync!=k) {}
         }
 
-        gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+        gebp(res.getSubMapper(info->task_info[i].lhs_start, 0), blockA+info->task_info[i].lhs_start*actual_kc, blockB, info->task_info[i].lhs_length, actual_kc, nc, alpha);
       }
 
       // Then keep going as usual with the remaining B'
@@ -151,11 +150,11 @@
       // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
       for(Index i=0; i<threads; ++i)
-        info[i].users -= 1;
+        info->task_info[i].users -= 1;
     }
   }
   else
-#endif // EIGEN_HAS_OPENMP
+#endif // defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
   {
     EIGEN_UNUSED_VARIABLE(info);
 
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 3d9cb5f..3e76827 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -13,49 +13,41 @@
 // IWYU pragma: private
 #include "../InternalHeaderCheck.h"
 
+// Note that in the following, there are 3 different uses of the concept
+// "number of threads":
+//  1. Max number of threads used by OpenMP or ThreadPool.
+//     * For OpenMP this is typically the value set by the OMP_NUM_THREADS
+//       environment variable, or by a call to omp_set_num_threads() prior to
+//       calling Eigen.
+//     * For ThreadPool, this is the number of threads in the ThreadPool.
+//  2. Max number of threads currently allowed to be used by parallel Eigen
+//     operations. This is set by setNbThreads(), and cannot exceed the value
+//     in 1.
+//  3. The actual number of threads used for a given parallel Eigen operation.
+//     This is typically computed on the fly using a cost model and cannot exceed
+//     the value in 2.
+//     * For OpenMP, this is typically the number of threads specified in individual
+//       "omp parallel" pragmas associated with an Eigen operation.
+//     * For ThreadPool, it is the number of concurrent tasks scheduled in the
+//       threadpool for a given Eigen operation. Notice that since the threadpool
+//       uses task stealing, there is no way to limit the number of concurrently
+//       executing tasks to below the number in 1. except by limiting the total
+//       number of tasks in flight.
+
+#if defined(EIGEN_HAS_OPENMP) && defined(EIGEN_GEMM_THREADPOOL)
+#error "EIGEN_HAS_OPENMP and EIGEN_GEMM_THREADPOOL may not both be defined."
+#endif
+
 namespace Eigen {
 
 namespace internal {
-
-/** \internal */
-inline void manage_multi_threading(Action action, int* v)
-{
-  static int m_maxThreads = -1;
-  EIGEN_UNUSED_VARIABLE(m_maxThreads)
-
-  if(action==SetAction)
-  {
-    eigen_internal_assert(v!=0);
-    m_maxThreads = *v;
-  }
-  else if(action==GetAction)
-  {
-    eigen_internal_assert(v!=0);
-    #ifdef EIGEN_HAS_OPENMP
-    if(m_maxThreads>0)
-      *v = m_maxThreads;
-    else
-      *v = omp_get_max_threads();
-    #else
-    *v = 1;
-    #endif
-  }
-  else
-  {
-    eigen_internal_assert(false);
-  }
+   inline void manage_multi_threading(Action action, int* v);
 }
 
-}
+// Public APIs.
 
 /** Must be call first when calling Eigen from multiple threads */
-inline void initParallel()
-{
-  int nbt;
-  internal::manage_multi_threading(GetAction, &nbt);
-  std::ptrdiff_t l1, l2, l3;
-  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
-}
+EIGEN_DEPRECATED inline void initParallel() {}
 
 /** \returns the max number of threads reserved for Eigen
   * \sa setNbThreads */
@@ -73,42 +65,111 @@
   internal::manage_multi_threading(SetAction, &v);
 }
 
-namespace internal {
+#ifdef EIGEN_GEMM_THREADPOOL
+// Sets the ThreadPool used by Eigen parallel Gemm.
+//
+// NOTICE: This function has a known race condition with
+// parallelize_gemm below, and should not be called while
+// an instance of that function is running.
+//
+// TODO(rmlarsen): Make the device API available instead of
+// storing a local static pointer variable to avoid this issue.
+inline ThreadPool* setGemmThreadPool(ThreadPool* new_pool) {
+  static ThreadPool* pool;
+  if (new_pool != nullptr) {
+    // This will wait for work in all threads in *pool to finish,
+    // then destroy the old ThreadPool, and then replace it with new_pool.
+    pool = new_pool;
+    // Reset the number of threads to the number of threads on the new pool.
+    setNbThreads(pool->NumThreads());
+  }
+  return pool;
+}
 
-template<typename Index> struct GemmParallelInfo
-{
-
-#ifdef EIGEN_HAS_OPENMP
-  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
-  std::atomic<Index> sync;
-  std::atomic<int> users;
-#else
-  GemmParallelInfo() : lhs_start(0), lhs_length(0) {}
+// Gets the ThreadPool used by Eigen parallel Gemm.
+inline ThreadPool* getGemmThreadPool() {
+  return setGemmThreadPool(nullptr);
+}
 #endif
 
+
+namespace internal {
+
+// Implementation.
+
+#if defined(EIGEN_USE_BLAS) || (!defined(EIGEN_HAS_OPENMP) && !defined(EIGEN_GEMM_THREADPOOL))
+
+inline void manage_multi_threading(Action action, int* v) {
+  if (action == SetAction) {
+    eigen_internal_assert(v != nullptr);
+  } else if (action == GetAction) {
+    eigen_internal_assert(v != nullptr);
+    *v = 1;
+  } else {
+    eigen_internal_assert(false);
+  }
+}
+template<typename Index> struct GemmParallelInfo {};
+template <bool Condition, typename Functor, typename Index>
+EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index cols,
+                                          Index /*unused*/, bool /*unused*/) {
+  func(0,rows, 0,cols);
+}
+
+#else
+
+template<typename Index> struct GemmParallelTaskInfo {
+  GemmParallelTaskInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+  std::atomic<Index> sync;
+  std::atomic<int> users;
   Index lhs_start;
   Index lhs_length;
 };
 
-template<bool Condition, typename Functor, typename Index>
-void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
-{
-  // TODO when EIGEN_USE_BLAS is defined,
-  // we should still enable OMP for other scalar types
-  // Without C++11, we have to disable GEMM's parallelization on
-  // non x86 architectures because there volatile is not enough for our purpose.
-  // See bug 1572.
-#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS)
-  // FIXME the transpose variable is only needed to properly split
-  // the matrix product when multithreading is enabled. This is a temporary
-  // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redesigned anyway.
-  EIGEN_UNUSED_VARIABLE(depth);
-  EIGEN_UNUSED_VARIABLE(transpose);
-  func(0,rows, 0,cols);
-#else
+template<typename Index> struct GemmParallelInfo {
+  const int logical_thread_id;
+  const int num_threads;
+  GemmParallelTaskInfo<Index>* task_info;
 
-  // Dynamically check whether we should enable or disable OpenMP.
+  GemmParallelInfo(int logical_thread_id_, int num_threads_,
+                   GemmParallelTaskInfo<Index>* task_info_)
+      : logical_thread_id(logical_thread_id_),
+        num_threads(num_threads_),
+        task_info(task_info_) {}
+};
+
+inline void manage_multi_threading(Action action, int* v) {
+  static int m_maxThreads = -1;
+  if (action == SetAction) {
+    eigen_internal_assert(v != nullptr);
+#if defined(EIGEN_HAS_OPENMP)
+    // Calling action == SetAction and *v = 0 means
+    // restoring m_maxThreads to the maximum number of threads specified
+    // for OpenMP.
+    eigen_internal_assert(*v >= 0);
+    int omp_threads = omp_get_max_threads();
+    m_maxThreads = (*v == 0 ? omp_threads : std::min(*v, omp_threads));
+#elif defined(EIGEN_GEMM_THREADPOOL)
+    // Calling action == SetAction and *v = 0 means
+    // restoring m_maxThreads to the number of threads in the ThreadPool,
+    // which defaults to 1 if no pool was provided.
+    eigen_internal_assert(*v >= 0);
+    ThreadPool* pool = getGemmThreadPool();
+    int pool_threads = pool != nullptr ? pool->NumThreads() : 1;
+    m_maxThreads = (*v == 0 ? pool_threads : numext::mini(pool_threads, *v));
+#endif
+  } else if (action == GetAction) {
+    eigen_internal_assert(v != nullptr);
+    *v = m_maxThreads;
+  } else {
+    eigen_internal_assert(false);
+  }
+}
+
+template <bool Condition, typename Functor, typename Index>
+EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index cols,
+                                          Index depth, bool transpose) {
+  // Dynamically check whether we should even try to execute in parallel.
   // The conditions are:
   // - the max number of threads we can create is greater than 1
   // - we are not already in a parallel code
@@ -126,27 +187,41 @@
   pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
 
   // compute the number of threads we are going to use
-  Index threads = std::min<Index>(nbThreads(), pb_max_threads);
+  int threads = std::min<int>(nbThreads(), static_cast<int>(pb_max_threads));
 
-  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
-  // then abort multi-threading
-  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
-  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
+  // if multi-threading is explicitly disabled, not useful, or if we already are
+  // inside a parallel session, then abort multi-threading
+  bool dont_parallelize = (!Condition) || (threads<=1);
+#if defined(EIGEN_HAS_OPENMP)
+  // don't parallelize if we are executing in a parallel context already.
+  dont_parallelize |= omp_get_num_threads() > 1;
+#elif defined(EIGEN_GEMM_THREADPOOL)
+  // don't parallelize if we have a trivial threadpool or the current thread id
+  // is != -1, indicating that we are already executing on a thread inside the pool.
+  // In other words, we do not allow nested parallelism, since this would lead to
+  // deadlocks due to the workstealing nature of the threadpool.
+  ThreadPool* pool = getGemmThreadPool();
+  dont_parallelize |= (pool == nullptr || pool->CurrentThreadId() != -1);
+#endif
+  if (dont_parallelize)
     return func(0,rows, 0,cols);
 
-  Eigen::initParallel();
   func.initParallelSession(threads);
 
   if(transpose)
     std::swap(rows,cols);
 
-  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
+  ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>,task_info,threads,0);
 
+
+#if defined(EIGEN_HAS_OPENMP)
   #pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
-    // Note that the actual number of threads might be lower than the number of request ones.
+    // Note that the actual number of threads might be lower than the number of
+    // requested ones
     Index actual_threads = omp_get_num_threads();
+    GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
 
     Index blockCols = (cols / actual_threads) & ~Index(0x3);
     Index blockRows = (rows / actual_threads);
@@ -158,17 +233,52 @@
     Index c0 = i*blockCols;
     Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
 
-    info[i].lhs_start = r0;
-    info[i].lhs_length = actualBlockRows;
+    info.task_info[i].lhs_start = r0;
+    info.task_info[i].lhs_length = actualBlockRows;
 
-    if(transpose) func(c0, actualBlockCols, 0, rows, info);
-    else          func(0, rows, c0, actualBlockCols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, &info);
+    else          func(0, rows, c0, actualBlockCols, &info);
   }
+
+#elif defined(EIGEN_GEMM_THREADPOOL)
+  ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>,meta_info,threads,0);
+  Barrier barrier(threads);
+  auto task = [=, &func, &barrier, &task_info](int i)
+  {
+    Index actual_threads = threads;
+    GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+    Index r0 = i*blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
+
+    Index c0 = i*blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
+
+    info.task_info[i].lhs_start = r0;
+    info.task_info[i].lhs_length = actualBlockRows;
+
+    if(transpose) func(c0, actualBlockCols, 0, rows, &info);
+    else          func(0, rows, c0, actualBlockCols, &info);
+
+    barrier.Notify();
+  };
+  // Notice that we do not schedule more than "threads" tasks, which allows us to
+  // limit number of running threads, even if the threadpool itself was constructed
+  // with a larger number of threads.
+  for (int i=0; i < threads - 1; ++i) {
+    pool->Schedule([=, task = std::move(task)] { task(i); });
+  }
+  task(threads - 1);
+  barrier.Wait();
 #endif
 }
 
-} // end namespace internal
+#endif
 
+} // end namespace internal
 } // end namespace Eigen
 
 #endif // EIGEN_PARALLELIZER_H
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index bed6cdd..7d1766c 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -21,6 +21,11 @@
     #pragma warning( push )
   #endif
   #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+  // We currently rely on has_denorm in tests, and need it defined correctly for half/bfloat16.
+  #ifndef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+    #define EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING 1
+    #define _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+  #endif
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 2a4126c..8bff87d 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -185,6 +185,7 @@
 template<typename Scalar> struct scalar_abs2_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_absolute_difference_op;
 template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_cbrt_op;
 template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
 template<typename Scalar> struct scalar_log_op;
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 7021e6d..c8238de 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -8,6 +8,11 @@
 #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
   #ifdef _MSC_VER
     #pragma warning( pop )
+    #ifdef EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING
+      #undef EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING
+      #undef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+    #endif
+
   #elif defined __INTEL_COMPILER
     #pragma warning pop
   #elif defined __clang__
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
index 301a900..b012326 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
@@ -5,6 +5,7 @@
 typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_cbrt_op<Scalar>, const Derived> CbrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
@@ -184,7 +185,7 @@
   * Example: \include Cwise_sqrt.cpp
   * Output: \verbinclude Cwise_sqrt.out
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square(), cbrt()
   */
 EIGEN_DEVICE_FUNC
 inline const SqrtReturnType
@@ -193,6 +194,22 @@
   return SqrtReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise cube root of *this.
+  *
+  * This function computes the coefficient-wise cube root. 
+  *
+  * Example: \include Cwise_cbrt.cpp
+  * Output: \verbinclude Cwise_cbrt.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cbrt">Math functions</a>, sqrt(), pow(), square()
+  */
+EIGEN_DEVICE_FUNC
+inline const CbrtReturnType
+cbrt() const
+{
+  return CbrtReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise inverse square root of *this.
   *
   * This function computes the coefficient-wise inverse square root.
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
index 0222137..cb65e17 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
@@ -17,6 +17,7 @@
 typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> CwiseArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CwiseCArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_cbrt_op<Scalar>, const Derived> CwiseCbrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
@@ -53,12 +54,25 @@
 ///
 EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
 ///
-/// \sa cwisePow(), cwiseSquare()
+/// \sa cwisePow(), cwiseSquare(), cwiseCbrt()
 ///
 EIGEN_DEVICE_FUNC
 inline const CwiseSqrtReturnType
 cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
 
+/// \returns an expression of the coefficient-wise cube root of *this.
+///
+/// Example: \include MatrixBase_cwiseCbrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseCbrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseCbrt,cube-root)
+///
+/// \sa cwiseSqrt(), cwiseSquare(), cwisePow()
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseCbrtReturnType
+cwiseCbrt() const { return CwiseSCbrtReturnType(derived()); }
+
 /// \returns an expression of the coefficient-wise signum of *this.
 ///
 /// Example: \include MatrixBase_cwiseSign.cpp
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index da89857..22cf346 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>

-

-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">

-

-  <!-- Fixed x Fixed Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>

-      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : $T3</Size>

-          <ValuePointer>m_storage.m_data.array</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : $T3</Size>

-          <ValuePointer>m_storage.m_data.array</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- 2 x 2 Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>

-      <DisplayString>[2, 2] (fixed matrix)</DisplayString>

-      <Expand>

-        <Synthetic Name="[row 0]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>

-        </Synthetic>        

-      </Expand>

-  </Type>

-  

-  <!-- 3 x 3 Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>

-      <DisplayString>[3, 3] (fixed matrix)</DisplayString>

-      <Expand>

-        <Synthetic Name="[row 0]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>

-        </Synthetic>        

-      </Expand>

-  </Type>

-  

-  <!-- 4 x 4 Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>

-      <DisplayString>[4, 4] (fixed matrix)</DisplayString>

-      <Expand>

-        <Synthetic Name="[row 0]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 3]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 3]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>

-        </Synthetic>        

-      </Expand>

-  </Type>  

-  

-  <!-- Dynamic x Dynamic Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Fixed x Dynamic Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Dynamic x Fixed Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Dynamic Column Vector -->

-  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>

-      <Expand>

-        <Item Name="[size]">m_storage.m_cols</Item>

-        <ArrayItems>

-          <Size>m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Dynamic Row Vector -->

-  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>

-      <Expand>

-        <Item Name="[size]">m_storage.m_rows</Item>

-        <ArrayItems>

-          <Size>m_storage.m_rows</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Fixed Vector -->

-  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>

-      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-      </Expand>

-  </Type>

-  

-  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>

-      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-        <Item Name="[y]">m_storage.m_data.array[1]</Item>

-      </Expand>

-  </Type>

-  

-  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>

-      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-        <Item Name="[y]">m_storage.m_data.array[1]</Item>

-        <Item Name="[z]">m_storage.m_data.array[2]</Item>

-      </Expand>

-  </Type>

-  

-    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>

-      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-        <Item Name="[y]">m_storage.m_data.array[1]</Item>

-        <Item Name="[z]">m_storage.m_data.array[2]</Item>

-        <Item Name="[w]">m_storage.m_data.array[3]</Item>

-      </Expand>

-  </Type>

-

-</AutoVisualizer>

+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+  <!-- Fixed x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- 2 x 2 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>
+      <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>
+  
+  <!-- 3 x 3 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>
+      <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>
+  
+  <!-- 4 x 4 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>
+      <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>  
+  
+  <!-- Dynamic x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Fixed x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic Column Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_cols</Item>
+        <ArrayItems>
+          <Size>m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic Row Vector -->
+  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_rows</Item>
+        <ArrayItems>
+          <Size>m_storage.m_rows</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Fixed Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>
+      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+      </Expand>
+  </Type>
+  
+  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>
+      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+      </Expand>
+  </Type>
+  
+  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>
+      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+      </Expand>
+  </Type>
+  
+    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>
+      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+        <Item Name="[w]">m_storage.m_data.array[3]</Item>
+      </Expand>
+  </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 35ef580..273c10d 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************

-; * Eigen Visualizer

-; *

-; * Author: Hauke Heibel <hauke.heibel@gmail.com>

-; *

-; * Support the enhanced debugging of the following Eigen

-; * types (*: any, +:fixed dimension) :

-; *

-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>

-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>

-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>

-; * - Eigen::Matrix<*,-1,-1,*,*,*>

-; * - Eigen::Matrix<*,+,-1,*,*,*>

-; * - Eigen::Matrix<*,-1,+,*,*,*>

-; * - Eigen::Matrix<*,+,+,*,*,*>

-; *

-; * Matrices are displayed properly independently of the memory

-; * alignment (RowMajor vs. ColMajor).

-; *

-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure

-; * that your original autoexp.dat file is copied to a safe 

-; * place before proceeding with its modification.

-; ***************************************************************

-

-[Visualizer]

-

-; Fixed size 4-vectors

-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{

-   children

-   (

-      #(

-        [internals]: [$c,!],

-         x : ($c.m_storage.m_data.array)[0],

-         y : ($c.m_storage.m_data.array)[1],

-         z : ($c.m_storage.m_data.array)[2],

-         w : ($c.m_storage.m_data.array)[3]

-      )

-   )

-

-   preview

-   (

-      #(

-        "[",

-        4,

-        "](",

-        #array(expr: $e.m_storage.m_data.array[$i], size: 4),

-        ")"

-      )

-   )

-}

-

-; Fixed size 3-vectors

-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{

-   children

-   (

-      #(

-        [internals]: [$c,!],

-         x : ($c.m_storage.m_data.array)[0],

-         y : ($c.m_storage.m_data.array)[1],

-         z : ($c.m_storage.m_data.array)[2]

-      )

-   )

-

-   preview

-   (

-      #(

-        "[",

-        3,

-        "](",

-        #array(expr: $e.m_storage.m_data.array[$i], size: 3),

-        ")"

-      )

-   )

-}

-

-; Fixed size 2-vectors

-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{

-   children

-   (

-      #(

-        [internals]: [$c,!],

-         x : ($c.m_storage.m_data.array)[0],

-         y : ($c.m_storage.m_data.array)[1]

-      )

-   )

-

-   preview

-   (

-      #(

-        "[",

-        2,

-        "](",

-        #array(expr: $e.m_storage.m_data.array[$i], size: 2),

-        ")"

-      )

-   )

-}

-

-; Fixed size 1-vectors

-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{

-   children

-   (

-      #(

-        [internals]: [$c,!],

-         x : ($c.m_storage.m_data.array)[0]

-      )

-   )

-

-   preview

-   (

-      #(

-        "[",

-        1,

-        "](",

-        #array(expr: $e.m_storage.m_data.array[$i], size: 1),

-        ")"

-      )

-   )

-}

-

-; Dynamic matrices (ColMajor and RowMajor support)

-Eigen::Matrix<*,-1,-1,*,*,*>{

-  children

-   (

-      #(

-         [internals]: [$c,!],

-         rows: $c.m_storage.m_rows,

-         cols: $c.m_storage.m_cols,

-         ; Check for RowMajorBit

-         #if ($c.Flags & 0x1) (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 

-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols

-             )

-         ) #else (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data)[$i],

-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols

-             )

-         )

-      )

-   )

-

-   preview

-   (

-     #(

-         "[",

-           $c.m_storage.m_rows,

-         ",",

-           $c.m_storage.m_cols,

-         "](",

-           #array(

-            expr :    [($c.m_storage.m_data)[$i],g],

-            size :    $c.m_storage.m_rows*$c.m_storage.m_cols

-           ),

-         ")"

-      )

-   )

-}

-

-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)

-Eigen::Matrix<*,*,-1,*,*,*>{

-  children

-   (

-      #(

-         [internals]: [$c,!],

-         rows: $c.RowsAtCompileTime,

-         cols: $c.m_storage.m_cols,

-         ; Check for RowMajorBit

-         #if ($c.Flags & 0x1) (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],

-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols

-             )

-         ) #else (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data)[$i],

-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols

-             )

-         )

-      )

-   )

-

-   preview

-   (

-     #(

-         "[",

-           $c.RowsAtCompileTime,

-         ",",

-           $c.m_storage.m_cols,

-         "](",

-           #array(

-            expr :    [($c.m_storage.m_data)[$i],g],

-            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols

-           ),

-         ")"

-      )

-   )

-}

-

-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)

-Eigen::Matrix<*,-1,*,*,*,*>{

-  children

-   (

-      #(

-         [internals]: [$c,!],

-         rows: $c.m_storage.m_rows,

-         cols: $c.ColsAtCompileTime,

-         ; Check for RowMajorBit

-         #if ($c.Flags & 0x1) (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 

-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime

-             )

-         ) #else (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data)[$i],

-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime

-             )

-         )

-      )

-   )

-

-   preview

-   (

-     #(

-         "[",

-           $c.m_storage.m_rows,

-         ",",

-           $c.ColsAtCompileTime,

-         "](",

-           #array(

-            expr :    [($c.m_storage.m_data)[$i],g],

-            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime

-           ),

-         ")"

-      )

-   )

-}

-

-; Fixed size matrix (ColMajor and RowMajor support)

-Eigen::Matrix<*,*,*,*,*,*>{

-  children

-   (

-      #(

-         [internals]: [$c,!],

-         rows: $c.RowsAtCompileTime,

-         cols: $c.ColsAtCompileTime,

-         ; Check for RowMajorBit

-         #if ($c.Flags & 0x1) (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 

-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime

-             )

-         ) #else (

-             #array(

-                rank: 2,

-                base: 0,

-                expr: ($c.m_storage.m_data.array)[$i],

-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime

-             )

-         )

-      )

-   )

-

-   preview

-   (

-     #(

-         "[",

-           $c.RowsAtCompileTime,

-         ",",

-           $c.ColsAtCompileTime,

-         "](",

-           #array(

-            expr :    [($c.m_storage.m_data.array)[$i],g],

-            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime

-           ),

-         ")"

-      )

-   )

-}

+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe 
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2],
+         w : ($c.m_storage.m_data.array)[3]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        4,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+        ")"
+      )
+   )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        3,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+        ")"
+      )
+   )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        2,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+        ")"
+      )
+   )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        1,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+        ")"
+      )
+   )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data.array)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index 18b4446..0256c81 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -140,6 +140,8 @@
 R.array().cube()               // P .^ 3
 R.cwiseSqrt()                  // sqrt(P)
 R.array().sqrt()               // sqrt(P)
+R.cwiseCbrt()                  // cbrt(P)
+R.array().cbrt()               // cbrt(P)
 R.array().exp()                // exp(P)
 R.array().log()                // log(P)
 R.cwiseMax(P)                  // max(R, P)
diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox
index 3f5c564..934a95a 100644
--- a/doc/CoeffwiseMathFunctionsTable.dox
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -172,6 +172,19 @@
 </tr>
 <tr>
   <td class="code">
+  \anchor cwisetable_cbrt
+  a.\link ArrayBase::cbrt cbrt\endlink(); \n
+  \link Eigen::cbrt cbrt\endlink(a);\n
+  m.\link MatrixBase::cwiseCbrt cwiseCbrt\endlink();
+  </td>
+  <td>computes cube root (\f$ \cbrt a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cbrt">std::cbrt</a>; \n
+  cbrt(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
   \anchor cwisetable_rsqrt
   a.\link ArrayBase::rsqrt rsqrt\endlink(); \n
   \link Eigen::rsqrt rsqrt\endlink(a);
diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox
index e96b617..c61d47a 100644
--- a/doc/QuickReference.dox
+++ b/doc/QuickReference.dox
@@ -458,6 +458,7 @@
 mat1.cwiseAbs2()
 mat1.cwiseAbs()
 mat1.cwiseSqrt()
+mat1.cwiseCbrt()
 mat1.cwiseInverse()
 mat1.cwiseProduct(mat2)
 mat1.cwiseQuotient(mat2)
@@ -470,6 +471,7 @@
 mat1.array().abs2()
 mat1.array().abs()
 mat1.array().sqrt()
+mat1.array().cbrt()
 mat1.array().inverse()
 mat1.array() * mat2.array()
 mat1.array() / mat2.array()
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index 68ac2dc..66288cd 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -172,6 +172,7 @@
   sm1.cwiseMax(sm2);
   sm1.cwiseAbs();
   sm1.cwiseSqrt();
+  sm1.cwiseCbrt();
   \endcode</td>
   <td>
   sm1 and sm2 should have the same storage order
diff --git a/doc/snippets/Cwise_cbrt.cpp b/doc/snippets/Cwise_cbrt.cpp
new file mode 100644
index 0000000..a58c76c
--- /dev/null
+++ b/doc/snippets/Cwise_cbrt.cpp
@@ -0,0 +1,2 @@
+Array3d v(1,2,4);
+cout << v.cbrt() << endl;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e0b2c83..0614a20 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -234,6 +234,7 @@
 ei_add_test(product_trsolve)
 ei_add_test(product_mmtr)
 ei_add_test(product_notemporary)
+ei_add_test(product_threaded "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 ei_add_test(stable_norm)
 ei_add_test(permutationmatrices)
 ei_add_test(bandmatrix)
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 058b721..bfea96a 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -169,7 +169,9 @@
 
 template <typename Scalar>
 void unary_ops_test() {
+  
   unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sqrt));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cbrt));
   unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(exp));
   unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(log));
   unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sin));
@@ -821,6 +823,7 @@
   m3 = m4.abs();
 
   VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
+  VERIFY_IS_APPROX(m3.cbrt(), cbrt(m3));
   VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
   VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
   VERIFY_IS_APPROX(m3.log(), log(m3));
@@ -882,6 +885,8 @@
 
   VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
+  VERIFY_IS_APPROX(m3.pow(RealScalar(1.0/3.0)), m3.cbrt());
+  VERIFY_IS_APPROX(pow(m3,RealScalar(1.0/3.0)), m3.cbrt());
 
   VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
diff --git a/test/product_threaded.cpp b/test/product_threaded.cpp
new file mode 100644
index 0000000..95dad79
--- /dev/null
+++ b/test/product_threaded.cpp
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_GEMM_THREADPOOL
+#include "main.h"
+
+void test_parallelize_gemm() {
+  constexpr int n = 1024;
+  constexpr int num_threads = 4;
+  MatrixXf a(n,n);
+  MatrixXf b(n,n);
+  MatrixXf c(n,n);
+  c.noalias() = a*b;
+
+  ThreadPool pool(num_threads);
+  MatrixXf c_threaded(n,n);
+  c_threaded.noalias() = a*b;
+
+  VERIFY_IS_APPROX(c, c_threaded);
+}
+
+
+
+EIGEN_DECLARE_TEST(product_threaded)
+{
+  CALL_SUBTEST(test_parallelize_gemm());
+}
diff --git a/test/ref.cpp b/test/ref.cpp
index f283537..f0faa94 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -341,6 +341,17 @@
   VERIFY(test_is_equal(data1, obj_data2, MatrixType::MaxSizeAtCompileTime == Dynamic && owns_data));
 }
 
+template <typename MatrixType>
+void test_contiguous_ref_no_copy(const PlainObjectBase<MatrixType> &obj) {
+  typedef Ref<MatrixType, Unaligned, Stride<0, 0>> Ref_;
+  typedef Ref<const MatrixType, Unaligned, Stride<0, 0>> CRef_;
+  MatrixType m(obj);
+  Ref_ ref(m);
+  VERIFY(test_is_equal(ref.data(), m.data(), true));
+  CRef_ cref(m);
+  VERIFY(test_is_equal(cref.data(), m.data(), true));
+}
+
 EIGEN_DECLARE_TEST(ref)
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -375,4 +386,8 @@
   CALL_SUBTEST_9( test_cref_move_ctor<MatrixXd>(MatrixXd(9, 5)) );
   CALL_SUBTEST_9( test_cref_move_ctor<Matrix3d>(Matrix3d::Ones()) );
   CALL_SUBTEST_9( test_cref_move_ctor<Matrix3d>(Matrix3d()) );
+  CALL_SUBTEST_10(test_contiguous_ref_no_copy(VectorXd(9)));
+  CALL_SUBTEST_10(test_contiguous_ref_no_copy(Vector3d()));
+  CALL_SUBTEST_10(test_contiguous_ref_no_copy(MatrixXd(9, 5)));
+  CALL_SUBTEST_10(test_contiguous_ref_no_copy(Matrix3d()));
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 7576808..9567d3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -1548,10 +1548,10 @@
     Index nn0 = numext::div_ceil(n, bn);
     Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn);
     double new_parallelism = static_cast<double>(new_tasks) /
-                             (numext::div_ceil<int>(new_tasks, num_threads) * num_threads);
+                             (numext::div_ceil<Index>(new_tasks, num_threads) * num_threads);
     Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
     double old_parallelism = static_cast<double>(old_tasks) /
-                             (numext::div_ceil<int>(old_tasks, num_threads) * num_threads);
+                             (numext::div_ceil<Index>(old_tasks, num_threads) * num_threads);
     if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
     return 0;
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 53b66c0..abf6834 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -373,7 +373,7 @@
     // computations:
     double max_efficiency =
         static_cast<double>(block_count) /
-        (numext::div_ceil<int>(block_count, numThreads()) * numThreads());
+        (numext::div_ceil<Index>(block_count, numThreads()) * numThreads());
 
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
@@ -396,7 +396,7 @@
       prev_block_count = coarser_block_count;
       const double coarser_efficiency =
           static_cast<double>(coarser_block_count) /
-          (numext::div_ceil<int>(coarser_block_count, numThreads()) * numThreads());
+          (numext::div_ceil<Index>(coarser_block_count, numThreads()) * numThreads());
       if (coarser_efficiency + 0.01 >= max_efficiency) {
         // Taking it.
         block_size = coarser_block_size;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4eebbe7..e8fb85d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -655,13 +655,11 @@
 
     const int block_size = device.maxGpuThreadsPerBlock();
     const int max_blocks =
-        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
-                              device.maxGpuThreadsPerMultiProcessor(),
-                          NumTraits<StorageIndex>::highest()) /
-        block_size;
+        static_cast<int>(numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
+                                               NumTraits<StorageIndex>::highest()) / block_size);
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, numext::div_ceil<int>(size, block_size)), 1);
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);
 
     LAUNCH_GPU_KERNEL(
         (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index c1ad675..062a56b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -271,7 +271,7 @@
       // Make sure the split point is aligned on a packet boundary.
       const typename Self::Index split =
           packetSize *
-          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
+          numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)),
                 packetSize);
       const typename Self::Index num_left =
           numext::mini(split - firstIndex, numValuesToReduce);