Update Eigen to commit:3cf6bb6f1ce9cd3cd6bdc43cdcfae7f4822ff2c3
CHANGELOG
=========
3cf6bb6f1 - Fix a bug in commit 76e8c0455396446f8166c798da5efe879e010bdc:
32165c6f0 - Fix Wshorten-64-to-32 warning in gemm parallelizer
b33dbb576 - Fix implicit narrowing warning in Parallelizer.h.
3a9635b20 - Link pthread for product_threaded test
f78c37f0a - traits<Ref>::match: use correct strides
516d08a49 - Fix typo in Parallelizer.h
76e8c0455 - Generalize parallel GEMM implementation in Core to work with ThreadPool in addition to OpenMP.
4d54c43d6 - Fix typo to allow nomalloc test to pass on AVX512.
a25f02d73 - Fix int overflow causing cxx11_tensor_gpu_1 to fail.
6f9ad7da6 - fix Wshorten-64-to-32 warnings in div_ceil
1aac9332c - TensorReduction: replace divup with div_ceil
5de0f2f89 - Fixes #2735: Component-wise cbrt
48b254a4b - Disable denorm deprecation warnings in MSVC C++23.
176821f2f - Avoid building docs if cross-compiling or not top level.
PiperOrigin-RevId: 582839371
Change-Id: I1a6247e188e09c8f9c56c5bbda59d4714573f5fd
diff --git a/Eigen/Core b/Eigen/Core
index 93d4789..700d815 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -349,6 +349,9 @@
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
#include "src/Core/products/GeneralBlockPanelKernel.h"
+#ifdef EIGEN_GEMM_THREADPOOL
+#include "ThreadPool"
+#endif
#include "src/Core/products/Parallelizer.h"
#include "src/Core/ProductEvaluators.h"
#include "src/Core/products/GeneralMatrixVector.h"
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 0fda71c..448dae2 100644
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -140,6 +140,7 @@
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round, _)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor, _)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(cbrt, Cbrt, _)
#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \
template< typename DstXprType, typename SrcXprNested, typename Plain> \
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8e9902e..d04f713 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -1017,6 +1017,10 @@
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet psqrt(const Packet& a) { return numext::sqrt(a); }
+/** \internal \returns the cube-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcbrt(const Packet& a) { return numext::cbrt(a); }
+
/** \internal \returns the rounded value of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pround(const Packet& a) { using numext::round; return round(a); }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 9818b86..2d8bb80 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -89,6 +89,7 @@
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(carg, scalar_carg_op, complex argument, \sa ArrayBase::carg DOXCOMMA MatrixBase::cwiseCArg)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cbrt,scalar_cbrt_op,cube root,\sa ArrayBase::cbrt DOXCOMMA MatrixBase::cwiseCbrt)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 0f5a0fd..6f2fd6d 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1394,6 +1394,14 @@
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
#endif
+/** \returns the cube root of \a x. **/
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cbrt(const T &x) {
+ EIGEN_USING_STD(cbrt);
+ return static_cast<T>(cbrt(x));
+}
+
/** \returns the reciprocal square root of \a x. **/
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index df43c05..5be39ce 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -26,7 +26,9 @@
enum {
Options = Options_,
Flags = traits<Map<PlainObjectType_, Options_, StrideType_> >::Flags | NestByRefBit,
- Alignment = traits<Map<PlainObjectType_, Options_, StrideType_> >::Alignment
+ Alignment = traits<Map<PlainObjectType_, Options_, StrideType_> >::Alignment,
+ InnerStrideAtCompileTime = traits<Map<PlainObjectType_, Options_, StrideType_> >::InnerStrideAtCompileTime,
+ OuterStrideAtCompileTime = traits<Map<PlainObjectType_, Options_, StrideType_> >::OuterStrideAtCompileTime
};
template<typename Derived> struct match {
@@ -34,11 +36,11 @@
IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
HasDirectAccess = internal::has_direct_access<Derived>::ret,
StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
- InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
- || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
- || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
+ InnerStrideMatch = int(InnerStrideAtCompileTime)==int(Dynamic)
+ || int(InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
+ || (int(InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
OuterStrideMatch = IsVectorAtCompileTime
- || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
+ || int(OuterStrideAtCompileTime)==int(Dynamic) || int(OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
// NOTE, this indirection of evaluator<Derived>::Alignment is needed
// to workaround a very strange bug in MSVC related to the instantiation
// of has_*ary_operator in evaluator<CwiseNullaryOp>.
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index faa3853..fc11174 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-//
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
-#define EIGEN_PACKET_MATH_FP16_AVX512_H
-
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
// IWYU pragma: private
-#include "../../InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
-
-template <>
-struct is_arithmetic<Packet8h> {
- enum { value = true };
-};
-
-template <>
-struct packet_traits<half> : default_packet_traits {
- typedef Packet32h type;
- typedef Packet16h half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
-
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
- HasSetLinear = 0,
- HasLog = 1,
- HasLog1p = 1,
- HasExp = 1,
- HasExpm1 = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- // These ones should be implemented in future
- HasBessel = 0,
- HasNdtri = 0,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasTanh = EIGEN_FAST_MATH,
- HasErf = 0, // EIGEN_FAST_MATH,
- HasBlend = 0,
- HasRound = 1,
- HasFloor = 1,
- HasCeil = 1,
- HasRint = 1
- };
-};
-
-template <>
-struct unpacket_traits<Packet32h> {
- typedef Eigen::half type;
- typedef Packet16h half;
- enum {
- size = 32,
- alignment = Aligned64,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet16h> {
- typedef Eigen::half type;
- typedef Packet8h half;
- enum {
- size = 16,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet8h> {
- typedef Eigen::half type;
- typedef Packet8h half;
- enum {
- size = 8,
- alignment = Aligned16,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-// Memory functions
-
-// pset1
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
- return _mm512_set1_ph(static_cast<_Float16>(from));
-}
-
-// pset1frombits
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
- return _mm512_castsi512_ph(_mm512_set1_epi16(from));
-}
-
-// pfirst
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- return half_impl::raw_uint16_to_half(
- static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
- Eigen::half dest[32];
- _mm512_storeu_ph(dest, from);
- return dest[0];
-#endif
-}
-
-// pload
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
-}
-
-// ploadu
-
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
-}
-
-// pstore
-
-template <>
-EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
-}
-
-// pstoreu
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
-}
-
-// ploaddup
-template <>
-EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
- __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
- return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
- 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
- a);
-}
-
-// ploadquad
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
- __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
- return _mm512_permutexvar_ph(
- _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
- a);
-}
-
-// pabs
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
- return _mm512_abs_ph(a);
-}
-
-// psignbit
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
- return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
-}
-
-// pmin
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_min_ph(a, b);
-}
-
-// pmax
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_max_ph(a, b);
-}
-
-// plset
-template <>
-EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
- return _mm512_add_ph(_mm512_set1_ph(a),
- _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
- 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
- 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
-}
-
-// por
-
-template <>
-EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pxor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pand
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pandnot
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
-}
-
-// pselect
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
- __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
- return _mm512_mask_blend_ph(mask32, a, b);
-}
-
-// pcmp_eq
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_le
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt_or_nan
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
-}
-
-// padd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_add_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// psub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_sub_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pmul
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_mul_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pdiv
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_div_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pround
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
- // Work-around for default std::round rounding mode.
-
- // Mask for the sign bit
- const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
- // The largest half-preicision float less than 0.5
- const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
-
- return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-// print
-
-template <>
-EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
-}
-
-// pceil
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
-}
-
-// pfloor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
-}
-
-// predux
-template <>
-EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
- return (half)_mm512_reduce_add_ph(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
- return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
- return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
-}
-
-// predux_half_dowto4
-template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
- __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
- Eigen::half data[32];
- _mm512_storeu_ph(data, a);
-
- __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
- __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
-}
-
-// predux_max
-
-// predux_min
-
-// predux_mul
-
-#ifdef EIGEN_VECTORIZE_FMA
-
-// pmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fnmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fnmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-#endif
-
-// pnegate
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
- return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
-}
-
-// pconj
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
- return a;
-}
-
-// psqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
- return _mm512_sqrt_ph(a);
-}
-
-// prsqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
- return _mm512_rsqrt_ph(a);
-}
-
-// preciprocal
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
- return _mm512_rcp_ph(a);
-}
-
-// ptranspose
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
- __m512i t[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 16; i++) {
- t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
- t[2 * i + 1] =
- _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
- }
-
- __m512i p[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 8; i++) {
- p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
- p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
- p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
- p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
- }
-
- __m512i q[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 4; i++) {
- q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
- q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
- q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
- q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
- q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
- q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
- q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
- q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
- }
-
- __m512i f[32];
-
-#define PACKET32H_TRANSPOSE_HELPER(X, Y) \
- do { \
- f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
- f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
- f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
- f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
- f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
- f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
- f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
- f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
- } while (false);
-
- PACKET32H_TRANSPOSE_HELPER(0, 0);
- PACKET32H_TRANSPOSE_HELPER(1, 1);
- PACKET32H_TRANSPOSE_HELPER(2, 2);
- PACKET32H_TRANSPOSE_HELPER(3, 3);
-
- PACKET32H_TRANSPOSE_HELPER(1, 0);
- PACKET32H_TRANSPOSE_HELPER(2, 0);
- PACKET32H_TRANSPOSE_HELPER(3, 0);
- PACKET32H_TRANSPOSE_HELPER(2, 1);
- PACKET32H_TRANSPOSE_HELPER(3, 1);
- PACKET32H_TRANSPOSE_HELPER(3, 2);
-
- PACKET32H_TRANSPOSE_HELPER(0, 1);
- PACKET32H_TRANSPOSE_HELPER(0, 2);
- PACKET32H_TRANSPOSE_HELPER(0, 3);
- PACKET32H_TRANSPOSE_HELPER(1, 2);
- PACKET32H_TRANSPOSE_HELPER(1, 3);
- PACKET32H_TRANSPOSE_HELPER(2, 3);
-
-#undef PACKET32H_TRANSPOSE_HELPER
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 32; i++) {
- a.packet[i] = _mm512_castsi512_ph(f[i]);
- }
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
- __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
- t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
- t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
- t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
- t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-
- p0 = _mm512_unpacklo_epi32(t0, t2);
- p1 = _mm512_unpackhi_epi32(t0, t2);
- p2 = _mm512_unpacklo_epi32(t1, t3);
- p3 = _mm512_unpackhi_epi32(t1, t3);
-
- a0 = p0;
- a1 = p1;
- a2 = p2;
- a3 = p3;
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
-
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
-
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
-
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
-
- a.packet[0] = _mm512_castsi512_ph(a0);
- a.packet[1] = _mm512_castsi512_ph(a1);
- a.packet[2] = _mm512_castsi512_ph(a2);
- a.packet[3] = _mm512_castsi512_ph(a3);
-}
-
-// preverse
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
- return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
- a);
-}
-
-// pscatter
-
-template <>
-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
- EIGEN_ALIGN64 half aux[32];
- pstore(aux, from);
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 32; i++) {
- to[stride * i] = aux[i];
- }
-}
-
-// pgather
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
- return _mm512_castsi512_ph(_mm512_set_epi16(
- from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
- from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
- from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
- from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
- from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
- from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
- from[1 * stride].x, from[0 * stride].x));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
- __m512d result = _mm512_undefined_pd();
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
- return _mm512_castpd_ph(result);
-}
-
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
- a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
- b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
-template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = psin(low);
- Packet16h highOut = psin(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pcos(low);
- Packet16h highOut = pcos(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog(low);
- Packet16h highOut = plog(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog2(low);
- Packet16h highOut = plog2(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog1p(low);
- Packet16h highOut = plog1p(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexp(low);
- Packet16h highOut = pexp(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexpm1(low);
- Packet16h highOut = pexpm1(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = ptanh(low);
- Packet16h highOut = ptanh(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1 = _mm256_undefined_si256();
- Packet16h exp2 = _mm256_undefined_si256();
-
- Packet16h lowOut = pfrexp(low, exp1);
- Packet16h highOut = pfrexp(high, exp2);
-
- exponent = combine2Packet16h(exp1, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1;
- Packet16h exp2;
- extract2Packet16h(exponent, exp1, exp2);
-
- Packet16h lowOut = pldexp(low, exp1);
- Packet16h highOut = pldexp(high, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+typedef __m512h Packet32h;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+
+template <>
+struct is_arithmetic<Packet8h> {
+ enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+ typedef Packet32h type;
+ typedef Packet16h half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 32,
+
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
+ HasSetLinear = 0,
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExp = 1,
+ HasExpm1 = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ // These ones should be implemented in future
+ HasBessel = 0,
+ HasNdtri = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = 0, // EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet32h> {
+ typedef Eigen::half type;
+ typedef Packet16h half;
+ enum {
+ size = 32,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet16h> {
+ typedef Eigen::half type;
+ typedef Packet8h half;
+ enum {
+ size = 16,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet8h> {
+ typedef Eigen::half type;
+ typedef Packet8h half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+// Memory functions
+
+// pset1
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
+ return _mm512_set1_ph(static_cast<_Float16>(from));
+}
+
+// pset1frombits
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
+ return _mm512_castsi512_ph(_mm512_set1_epi16(from));
+}
+
+// pfirst
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ return half_impl::raw_uint16_to_half(
+ static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
+#else
+ Eigen::half dest[32];
+ _mm512_storeu_ph(dest, from);
+ return dest[0];
+#endif
+}
+
+// pload
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
+}
+
+// ploadu
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
+}
+
+// pstore
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
+}
+
+// pstoreu
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
+}
+
+// ploaddup
+template <>
+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
+ __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
+ return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+ 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+ a);
+}
+
+// ploadquad
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
+ __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
+ return _mm512_permutexvar_ph(
+ _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+ a);
+}
+
+// pabs
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
+ return _mm512_abs_ph(a);
+}
+
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+ return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
+// pmin
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_min_ph(a, b);
+}
+
+// pmax
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_max_ph(a, b);
+}
+
+// plset
+template <>
+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
+ return _mm512_add_ph(_mm512_set1_ph(a),
+ _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
+ 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
+ 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+
+// por
+
+template <>
+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pxor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pand
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pandnot
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+}
+
+// pselect
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+ return _mm512_mask_blend_ph(mask32, a, b);
+}
+
+// pcmp_eq
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_le
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt_or_nan
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+}
+
+// padd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// psub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pmul
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pdiv
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pround
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
+ // Work-around for default std::round rounding mode.
+
+ // Mask for the sign bit
+ const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+ // The largest half-preicision float less than 0.5
+ const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+
+ return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+// print
+
+template <>
+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+// pceil
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+// pfloor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// predux
+template <>
+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
+ return (half)_mm512_reduce_add_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
+ return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
+ return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+}
+
+// predux_half_dowto4
+template <>
+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
+ __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+
+ return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#else
+ Eigen::half data[32];
+ _mm512_storeu_ph(data, a);
+
+ __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
+ __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
+
+ return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#endif
+}
+
+// predux_max
+
+// predux_min
+
+// predux_mul
+
+#ifdef EIGEN_VECTORIZE_FMA
+
+// pmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+#endif
+
+// pnegate
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
+ return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
+}
+
+// pconj
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
+ return a;
+}
+
+// psqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
+ return _mm512_sqrt_ph(a);
+}
+
+// prsqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
+ return _mm512_rsqrt_ph(a);
+}
+
+// preciprocal
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
+ return _mm512_rcp_ph(a);
+}
+
+// ptranspose
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
+ __m512i t[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 16; i++) {
+ t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+ t[2 * i + 1] =
+ _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+ }
+
+ __m512i p[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 8; i++) {
+ p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
+ p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
+ p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
+ p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
+ }
+
+ __m512i q[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 4; i++) {
+ q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
+ q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
+ q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
+ q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
+ q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
+ q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
+ q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
+ q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
+ }
+
+ __m512i f[32];
+
+#define PACKET32H_TRANSPOSE_HELPER(X, Y) \
+ do { \
+ f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
+ f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
+ f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
+ f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
+ f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
+ f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
+ f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
+ f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+ } while (false);
+
+ PACKET32H_TRANSPOSE_HELPER(0, 0);
+ PACKET32H_TRANSPOSE_HELPER(1, 1);
+ PACKET32H_TRANSPOSE_HELPER(2, 2);
+ PACKET32H_TRANSPOSE_HELPER(3, 3);
+
+ PACKET32H_TRANSPOSE_HELPER(1, 0);
+ PACKET32H_TRANSPOSE_HELPER(2, 0);
+ PACKET32H_TRANSPOSE_HELPER(3, 0);
+ PACKET32H_TRANSPOSE_HELPER(2, 1);
+ PACKET32H_TRANSPOSE_HELPER(3, 1);
+ PACKET32H_TRANSPOSE_HELPER(3, 2);
+
+ PACKET32H_TRANSPOSE_HELPER(0, 1);
+ PACKET32H_TRANSPOSE_HELPER(0, 2);
+ PACKET32H_TRANSPOSE_HELPER(0, 3);
+ PACKET32H_TRANSPOSE_HELPER(1, 2);
+ PACKET32H_TRANSPOSE_HELPER(1, 3);
+ PACKET32H_TRANSPOSE_HELPER(2, 3);
+
+#undef PACKET32H_TRANSPOSE_HELPER
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 32; i++) {
+ a.packet[i] = _mm512_castsi512_ph(f[i]);
+ }
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
+ __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
+ t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+ t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+ t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+ t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+
+ p0 = _mm512_unpacklo_epi32(t0, t2);
+ p1 = _mm512_unpackhi_epi32(t0, t2);
+ p2 = _mm512_unpacklo_epi32(t1, t3);
+ p3 = _mm512_unpackhi_epi32(t1, t3);
+
+ a0 = p0;
+ a1 = p1;
+ a2 = p2;
+ a3 = p3;
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
+
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
+
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
+
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
+
+ a.packet[0] = _mm512_castsi512_ph(a0);
+ a.packet[1] = _mm512_castsi512_ph(a1);
+ a.packet[2] = _mm512_castsi512_ph(a2);
+ a.packet[3] = _mm512_castsi512_ph(a3);
+}
+
+// preverse
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
+ return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+ a);
+}
+
+// pscatter
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
+ EIGEN_ALIGN64 half aux[32];
+ pstore(aux, from);
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 32; i++) {
+ to[stride * i] = aux[i];
+ }
+}
+
+// pgather
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+ return _mm512_castsi512_ph(_mm512_set_epi16(
+ from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
+ from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
+ from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
+ from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
+ from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
+ from[1 * stride].x, from[0 * stride].x));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+ __m512d result = _mm512_undefined_pd();
+ result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
+ result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
+ return _mm512_castpd_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+ a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
+ b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
+}
+
+// psin
+template <>
+EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = psin(low);
+ Packet16h highOut = psin(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pcos
+template <>
+EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pcos(low);
+ Packet16h highOut = pcos(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog
+template <>
+EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog(low);
+ Packet16h highOut = plog(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog2
+template <>
+EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog2(low);
+ Packet16h highOut = plog2(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog1p
+template <>
+EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog1p(low);
+ Packet16h highOut = plog1p(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pexp(low);
+ Packet16h highOut = pexp(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pexpm1
+template <>
+EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pexpm1(low);
+ Packet16h highOut = pexpm1(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// ptanh
+template <>
+EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = ptanh(low);
+ Packet16h highOut = ptanh(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h exp1 = _mm256_undefined_si256();
+ Packet16h exp2 = _mm256_undefined_si256();
+
+ Packet16h lowOut = pfrexp(low, exp1);
+ Packet16h highOut = pfrexp(high, exp2);
+
+ exponent = combine2Packet16h(exp1, exp2);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h exp1;
+ Packet16h exp2;
+ extract2Packet16h(exponent, exp1, exp2);
+
+ Packet16h lowOut = pldexp(low, exp1);
+ Packet16h highOut = pldexp(high, exp2);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index f0b1797..a3025ec 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -1082,7 +1082,7 @@
Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
@@ -1098,7 +1098,7 @@
Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
@@ -1132,7 +1132,7 @@
Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
@@ -1148,7 +1148,7 @@
Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
-#ifdef EIGEN_NO_RUNTIME_MALLOC
+#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 89cd677..d988eb1d 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -472,6 +472,20 @@
};
/** \internal
+ * \brief Template functor to compute the cube root of a scalar
+ * \sa class CwiseUnaryOp, Cwise::sqrt()
+ */
+template <typename Scalar>
+struct scalar_cbrt_op {
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_cbrt_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+/** \internal
* \brief Template functor to compute the reciprocal square root of a scalar
* \sa class CwiseUnaryOp, Cwise::rsqrt()
*/
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index e8bb821..3e7784d 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -84,12 +84,12 @@
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
-#ifdef EIGEN_HAS_OPENMP
+#if defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
if(info)
{
// this is the parallel version!
- int tid = omp_get_thread_num();
- int threads = omp_get_num_threads();
+ int tid = info->logical_thread_id;
+ int threads = info->num_threads;
LhsScalar* blockA = blocking.blockA();
eigen_internal_assert(blockA!=0);
@@ -110,15 +110,15 @@
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
// However, before copying to A'_i, we have to make sure that no other thread is still using it,
- // i.e., we test that info[tid].users equals 0.
- // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
- while(info[tid].users!=0) {}
- info[tid].users = threads;
+ // i.e., we test that info->task_info[tid].users equals 0.
+ // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to use it.
+ while(info->task_info[tid].users!=0) {}
+ info->task_info[tid].users = threads;
- pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
+ pack_lhs(blockA+info->task_info[tid].lhs_start*actual_kc, lhs.getSubMapper(info->task_info[tid].lhs_start,k), actual_kc, info->task_info[tid].lhs_length);
// Notify the other threads that the part A'_i is ready to go.
- info[tid].sync = k;
+ info->task_info[tid].sync = k;
// Computes C_i += A' * B' per A'_i
for(int shift=0; shift<threads; ++shift)
@@ -129,11 +129,10 @@
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if (shift>0) {
- while(info[i].sync!=k) {
- }
+ while(info->task_info[i].sync!=k) {}
}
- gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+ gebp(res.getSubMapper(info->task_info[i].lhs_start, 0), blockA+info->task_info[i].lhs_start*actual_kc, blockB, info->task_info[i].lhs_length, actual_kc, nc, alpha);
}
// Then keep going as usual with the remaining B'
@@ -151,11 +150,11 @@
// Release all the sub blocks A'_i of A' for the current thread,
// i.e., we simply decrement the number of users by 1
for(Index i=0; i<threads; ++i)
- info[i].users -= 1;
+ info->task_info[i].users -= 1;
}
}
else
-#endif // EIGEN_HAS_OPENMP
+#endif // defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
{
EIGEN_UNUSED_VARIABLE(info);
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 3d9cb5f..3e76827 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -13,49 +13,41 @@
// IWYU pragma: private
#include "../InternalHeaderCheck.h"
+// Note that in the following, there are 3 different uses of the concept
+// "number of threads":
+// 1. Max number of threads used by OpenMP or ThreadPool.
+// * For OpenMP this is typically the value set by the OMP_NUM_THREADS
+// environment variable, or by a call to omp_set_num_threads() prior to
+// calling Eigen.
+// * For ThreadPool, this is the number of threads in the ThreadPool.
+// 2. Max number of threads currently allowed to be used by parallel Eigen
+// operations. This is set by setNbThreads(), and cannot exceed the value
+// in 1.
+// 3. The actual number of threads used for a given parallel Eigen operation.
+// This is typically computed on the fly using a cost model and cannot exceed
+// the value in 2.
+// * For OpenMP, this is typically the number of threads specified in individual
+// "omp parallel" pragmas associated with an Eigen operation.
+// * For ThreadPool, it is the number of concurrent tasks scheduled in the
+// threadpool for a given Eigen operation. Notice that since the threadpool
+// uses task stealing, there is no way to limit the number of concurrently
+// executing tasks to below the number in 1. except by limiting the total
+// number of tasks in flight.
+
+#if defined(EIGEN_HAS_OPENMP) && defined(EIGEN_GEMM_THREADPOOL)
+#error "EIGEN_HAS_OPENMP and EIGEN_GEMM_THREADPOOL may not both be defined."
+#endif
+
namespace Eigen {
namespace internal {
-
-/** \internal */
-inline void manage_multi_threading(Action action, int* v)
-{
- static int m_maxThreads = -1;
- EIGEN_UNUSED_VARIABLE(m_maxThreads)
-
- if(action==SetAction)
- {
- eigen_internal_assert(v!=0);
- m_maxThreads = *v;
- }
- else if(action==GetAction)
- {
- eigen_internal_assert(v!=0);
- #ifdef EIGEN_HAS_OPENMP
- if(m_maxThreads>0)
- *v = m_maxThreads;
- else
- *v = omp_get_max_threads();
- #else
- *v = 1;
- #endif
- }
- else
- {
- eigen_internal_assert(false);
- }
+ inline void manage_multi_threading(Action action, int* v);
}
-}
+// Public APIs.
/** Must be call first when calling Eigen from multiple threads */
-inline void initParallel()
-{
- int nbt;
- internal::manage_multi_threading(GetAction, &nbt);
- std::ptrdiff_t l1, l2, l3;
- internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
-}
+EIGEN_DEPRECATED inline void initParallel() {}
/** \returns the max number of threads reserved for Eigen
* \sa setNbThreads */
@@ -73,42 +65,111 @@
internal::manage_multi_threading(SetAction, &v);
}
-namespace internal {
+#ifdef EIGEN_GEMM_THREADPOOL
+// Sets the ThreadPool used by Eigen parallel Gemm.
+//
+// NOTICE: This function has a known race condition with
+// parallelize_gemm below, and should not be called while
+// an instance of that function is running.
+//
+// TODO(rmlarsen): Make the device API available instead of
+// storing a local static pointer variable to avoid this issue.
+inline ThreadPool* setGemmThreadPool(ThreadPool* new_pool) {
+ static ThreadPool* pool;
+ if (new_pool != nullptr) {
+ // This will wait for work in all threads in *pool to finish,
+ // then destroy the old ThreadPool, and then replace it with new_pool.
+ pool = new_pool;
+ // Reset the number of threads to the number of threads on the new pool.
+ setNbThreads(pool->NumThreads());
+ }
+ return pool;
+}
-template<typename Index> struct GemmParallelInfo
-{
-
-#ifdef EIGEN_HAS_OPENMP
- GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
- std::atomic<Index> sync;
- std::atomic<int> users;
-#else
- GemmParallelInfo() : lhs_start(0), lhs_length(0) {}
+// Gets the ThreadPool used by Eigen parallel Gemm.
+inline ThreadPool* getGemmThreadPool() {
+ return setGemmThreadPool(nullptr);
+}
#endif
+
+namespace internal {
+
+// Implementation.
+
+#if defined(EIGEN_USE_BLAS) || (!defined(EIGEN_HAS_OPENMP) && !defined(EIGEN_GEMM_THREADPOOL))
+
+inline void manage_multi_threading(Action action, int* v) {
+ if (action == SetAction) {
+ eigen_internal_assert(v != nullptr);
+ } else if (action == GetAction) {
+ eigen_internal_assert(v != nullptr);
+ *v = 1;
+ } else {
+ eigen_internal_assert(false);
+ }
+}
+template<typename Index> struct GemmParallelInfo {};
+template <bool Condition, typename Functor, typename Index>
+EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index cols,
+ Index /*unused*/, bool /*unused*/) {
+ func(0,rows, 0,cols);
+}
+
+#else
+
+template<typename Index> struct GemmParallelTaskInfo {
+ GemmParallelTaskInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+ std::atomic<Index> sync;
+ std::atomic<int> users;
Index lhs_start;
Index lhs_length;
};
-template<bool Condition, typename Functor, typename Index>
-void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
-{
- // TODO when EIGEN_USE_BLAS is defined,
- // we should still enable OMP for other scalar types
- // Without C++11, we have to disable GEMM's parallelization on
- // non x86 architectures because there volatile is not enough for our purpose.
- // See bug 1572.
-#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS)
- // FIXME the transpose variable is only needed to properly split
- // the matrix product when multithreading is enabled. This is a temporary
- // fix to support row-major destination matrices. This whole
- // parallelizer mechanism has to be redesigned anyway.
- EIGEN_UNUSED_VARIABLE(depth);
- EIGEN_UNUSED_VARIABLE(transpose);
- func(0,rows, 0,cols);
-#else
+template<typename Index> struct GemmParallelInfo {
+ const int logical_thread_id;
+ const int num_threads;
+ GemmParallelTaskInfo<Index>* task_info;
- // Dynamically check whether we should enable or disable OpenMP.
+ GemmParallelInfo(int logical_thread_id_, int num_threads_,
+ GemmParallelTaskInfo<Index>* task_info_)
+ : logical_thread_id(logical_thread_id_),
+ num_threads(num_threads_),
+ task_info(task_info_) {}
+};
+
+inline void manage_multi_threading(Action action, int* v) {
+ static int m_maxThreads = -1;
+ if (action == SetAction) {
+ eigen_internal_assert(v != nullptr);
+#if defined(EIGEN_HAS_OPENMP)
+ // Calling action == SetAction and *v = 0 means
+ // restoring m_maxThreads to the maximum number of threads specified
+ // for OpenMP.
+ eigen_internal_assert(*v >= 0);
+ int omp_threads = omp_get_max_threads();
+ m_maxThreads = (*v == 0 ? omp_threads : std::min(*v, omp_threads));
+#elif defined(EIGEN_GEMM_THREADPOOL)
+ // Calling action == SetAction and *v = 0 means
+ // restoring m_maxThreads to the number of threads in the ThreadPool,
+ // which defaults to 1 if no pool was provided.
+ eigen_internal_assert(*v >= 0);
+ ThreadPool* pool = getGemmThreadPool();
+ int pool_threads = pool != nullptr ? pool->NumThreads() : 1;
+ m_maxThreads = (*v == 0 ? pool_threads : numext::mini(pool_threads, *v));
+#endif
+ } else if (action == GetAction) {
+ eigen_internal_assert(v != nullptr);
+ *v = m_maxThreads;
+ } else {
+ eigen_internal_assert(false);
+ }
+}
+
+template <bool Condition, typename Functor, typename Index>
+EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index cols,
+ Index depth, bool transpose) {
+ // Dynamically check whether we should even try to execute in parallel.
// The conditions are:
// - the max number of threads we can create is greater than 1
// - we are not already in a parallel code
@@ -126,27 +187,41 @@
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
// compute the number of threads we are going to use
- Index threads = std::min<Index>(nbThreads(), pb_max_threads);
+ int threads = std::min<int>(nbThreads(), static_cast<int>(pb_max_threads));
- // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
- // then abort multi-threading
- // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
- if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
+ // if multi-threading is explicitly disabled, not useful, or if we already are
+ // inside a parallel session, then abort multi-threading
+ bool dont_parallelize = (!Condition) || (threads<=1);
+#if defined(EIGEN_HAS_OPENMP)
+ // don't parallelize if we are executing in a parallel context already.
+ dont_parallelize |= omp_get_num_threads() > 1;
+#elif defined(EIGEN_GEMM_THREADPOOL)
+ // don't parallelize if we have a trivial threadpool or the current thread id
+ // is != -1, indicating that we are already executing on a thread inside the pool.
+ // In other words, we do not allow nested parallelism, since this would lead to
+ // deadlocks due to the workstealing nature of the threadpool.
+ ThreadPool* pool = getGemmThreadPool();
+ dont_parallelize |= (pool == nullptr || pool->CurrentThreadId() != -1);
+#endif
+ if (dont_parallelize)
return func(0,rows, 0,cols);
- Eigen::initParallel();
func.initParallelSession(threads);
if(transpose)
std::swap(rows,cols);
- ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
+ ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>,task_info,threads,0);
+
+#if defined(EIGEN_HAS_OPENMP)
#pragma omp parallel num_threads(threads)
{
Index i = omp_get_thread_num();
- // Note that the actual number of threads might be lower than the number of request ones.
+ // Note that the actual number of threads might be lower than the number of
+ // requested ones
Index actual_threads = omp_get_num_threads();
+ GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
Index blockCols = (cols / actual_threads) & ~Index(0x3);
Index blockRows = (rows / actual_threads);
@@ -158,17 +233,52 @@
Index c0 = i*blockCols;
Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
- info[i].lhs_start = r0;
- info[i].lhs_length = actualBlockRows;
+ info.task_info[i].lhs_start = r0;
+ info.task_info[i].lhs_length = actualBlockRows;
- if(transpose) func(c0, actualBlockCols, 0, rows, info);
- else func(0, rows, c0, actualBlockCols, info);
+ if(transpose) func(c0, actualBlockCols, 0, rows, &info);
+ else func(0, rows, c0, actualBlockCols, &info);
}
+
+#elif defined(EIGEN_GEMM_THREADPOOL)
+ ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>,meta_info,threads,0);
+ Barrier barrier(threads);
+ auto task = [=, &func, &barrier, &task_info](int i)
+ {
+ Index actual_threads = threads;
+ GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
+ Index blockCols = (cols / actual_threads) & ~Index(0x3);
+ Index blockRows = (rows / actual_threads);
+ blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+ Index r0 = i*blockRows;
+ Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
+
+ Index c0 = i*blockCols;
+ Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
+
+ info.task_info[i].lhs_start = r0;
+ info.task_info[i].lhs_length = actualBlockRows;
+
+ if(transpose) func(c0, actualBlockCols, 0, rows, &info);
+ else func(0, rows, c0, actualBlockCols, &info);
+
+ barrier.Notify();
+ };
+ // Notice that we do not schedule more than "threads" tasks, which allows us to
+ // limit number of running threads, even if the threadpool itself was constructed
+ // with a larger number of threads.
+ for (int i=0; i < threads - 1; ++i) {
+ pool->Schedule([=, task = std::move(task)] { task(i); });
+ }
+ task(threads - 1);
+ barrier.Wait();
#endif
}
-} // end namespace internal
+#endif
+} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PARALLELIZER_H
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index bed6cdd..7d1766c 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -21,6 +21,11 @@
#pragma warning( push )
#endif
#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+ // We currently rely on has_denorm in tests, and need it defined correctly for half/bfloat16.
+ #ifndef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+ #define EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING 1
+ #define _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+ #endif
#elif defined __INTEL_COMPILER
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 2a4126c..8bff87d 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -185,6 +185,7 @@
template<typename Scalar> struct scalar_abs2_op;
template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_absolute_difference_op;
template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_cbrt_op;
template<typename Scalar> struct scalar_rsqrt_op;
template<typename Scalar> struct scalar_exp_op;
template<typename Scalar> struct scalar_log_op;
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 7021e6d..c8238de 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -8,6 +8,11 @@
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
#ifdef _MSC_VER
#pragma warning( pop )
+ #ifdef EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING
+ #undef EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING
+ #undef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+ #endif
+
#elif defined __INTEL_COMPILER
#pragma warning pop
#elif defined __clang__
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
index 301a900..b012326 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
@@ -5,6 +5,7 @@
typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CArgReturnType;
typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_cbrt_op<Scalar>, const Derived> CbrtReturnType;
typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
@@ -184,7 +185,7 @@
* Example: \include Cwise_sqrt.cpp
* Output: \verbinclude Cwise_sqrt.out
*
- * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square()
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square(), cbrt()
*/
EIGEN_DEVICE_FUNC
inline const SqrtReturnType
@@ -193,6 +194,22 @@
return SqrtReturnType(derived());
}
+/** \returns an expression of the coefficient-wise cube root of *this.
+ *
+ * This function computes the coefficient-wise cube root.
+ *
+ * Example: \include Cwise_cbrt.cpp
+ * Output: \verbinclude Cwise_cbrt.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cbrt">Math functions</a>, sqrt(), pow(), square()
+ */
+EIGEN_DEVICE_FUNC
+inline const CbrtReturnType
+cbrt() const
+{
+ return CbrtReturnType(derived());
+}
+
/** \returns an expression of the coefficient-wise inverse square root of *this.
*
* This function computes the coefficient-wise inverse square root.
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
index 0222137..cb65e17 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
@@ -17,6 +17,7 @@
typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> CwiseArgReturnType;
typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CwiseCArgReturnType;
typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_cbrt_op<Scalar>, const Derived> CwiseCbrtReturnType;
typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
@@ -53,12 +54,25 @@
///
EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
///
-/// \sa cwisePow(), cwiseSquare()
+/// \sa cwisePow(), cwiseSquare(), cwiseCbrt()
///
EIGEN_DEVICE_FUNC
inline const CwiseSqrtReturnType
cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
+/// \returns an expression of the coefficient-wise cube root of *this.
+///
+/// Example: \include MatrixBase_cwiseCbrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseCbrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseCbrt,cube-root)
+///
+/// \sa cwiseSqrt(), cwiseSquare(), cwisePow()
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseCbrtReturnType
+cwiseCbrt() const { return CwiseSCbrtReturnType(derived()); }
+
/// \returns an expression of the coefficient-wise signum of *this.
///
/// Example: \include MatrixBase_cwiseSign.cpp
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index da89857..22cf346 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-
- <!-- Fixed x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- 2 x 2 Matrix -->
- <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
- <DisplayString>[2, 2] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 3 x 3 Matrix -->
- <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
- <DisplayString>[3, 3] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 4 x 4 Matrix -->
- <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
- <DisplayString>[4, 4] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- Dynamic x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Column Vector -->
- <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_cols</Item>
- <ArrayItems>
- <Size>m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Row Vector -->
- <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_rows</Item>
- <ArrayItems>
- <Size>m_storage.m_rows</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed Vector -->
- <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
- <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
- <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
- <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
- <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- <Item Name="[w]">m_storage.m_data.array[3]</Item>
- </Expand>
- </Type>
-
-</AutoVisualizer>
+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+ <!-- Fixed x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- 2 x 2 Matrix -->
+ <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
+ <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 3 x 3 Matrix -->
+ <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
+ <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 4 x 4 Matrix -->
+ <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
+ <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Column Vector -->
+ <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_cols</Item>
+ <ArrayItems>
+ <Size>m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Row Vector -->
+ <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_rows</Item>
+ <ArrayItems>
+ <Size>m_storage.m_rows</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed Vector -->
+ <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
+ <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
+ <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
+ <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
+ <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ <Item Name="[w]">m_storage.m_data.array[3]</Item>
+ </Expand>
+ </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 35ef580..273c10d 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2],
- w : ($c.m_storage.m_data.array)[3]
- )
- )
-
- preview
- (
- #(
- "[",
- 4,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 4),
- ")"
- )
- )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2]
- )
- )
-
- preview
- (
- #(
- "[",
- 3,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 3),
- ")"
- )
- )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1]
- )
- )
-
- preview
- (
- #(
- "[",
- 2,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 2),
- ")"
- )
- )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0]
- )
- )
-
- preview
- (
- #(
- "[",
- 1,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 1),
- ")"
- )
- )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.RowsAtCompileTime*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data.array)[$i],g],
- size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2],
+ w : ($c.m_storage.m_data.array)[3]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 4,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+ ")"
+ )
+ )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 3,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+ ")"
+ )
+ )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 2,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+ ")"
+ )
+ )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 1,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+ ")"
+ )
+ )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.RowsAtCompileTime*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data.array)[$i],g],
+ size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index 18b4446..0256c81 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -140,6 +140,8 @@
R.array().cube() // P .^ 3
R.cwiseSqrt() // sqrt(P)
R.array().sqrt() // sqrt(P)
+R.cwiseCbrt() // cbrt(P)
+R.array().cbrt() // cbrt(P)
R.array().exp() // exp(P)
R.array().log() // log(P)
R.cwiseMax(P) // max(R, P)
diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox
index 3f5c564..934a95a 100644
--- a/doc/CoeffwiseMathFunctionsTable.dox
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -172,6 +172,19 @@
</tr>
<tr>
<td class="code">
+ \anchor cwisetable_cbrt
+ a.\link ArrayBase::cbrt cbrt\endlink(); \n
+ \link Eigen::cbrt cbrt\endlink(a);\n
+ m.\link MatrixBase::cwiseCbrt cwiseCbrt\endlink();
+ </td>
+ <td>computes cube root (\f$ \cbrt a_i \f$)</td>
+ <td class="code">
+ using <a href="http://en.cppreference.com/w/cpp/numeric/math/cbrt">std::cbrt</a>; \n
+ cbrt(a[i]);</td>
+ <td></td>
+</tr>
+<tr>
+ <td class="code">
\anchor cwisetable_rsqrt
a.\link ArrayBase::rsqrt rsqrt\endlink(); \n
\link Eigen::rsqrt rsqrt\endlink(a);
diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox
index e96b617..c61d47a 100644
--- a/doc/QuickReference.dox
+++ b/doc/QuickReference.dox
@@ -458,6 +458,7 @@
mat1.cwiseAbs2()
mat1.cwiseAbs()
mat1.cwiseSqrt()
+mat1.cwiseCbrt()
mat1.cwiseInverse()
mat1.cwiseProduct(mat2)
mat1.cwiseQuotient(mat2)
@@ -470,6 +471,7 @@
mat1.array().abs2()
mat1.array().abs()
mat1.array().sqrt()
+mat1.array().cbrt()
mat1.array().inverse()
mat1.array() * mat2.array()
mat1.array() / mat2.array()
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index 68ac2dc..66288cd 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -172,6 +172,7 @@
sm1.cwiseMax(sm2);
sm1.cwiseAbs();
sm1.cwiseSqrt();
+ sm1.cwiseCbrt();
\endcode</td>
<td>
sm1 and sm2 should have the same storage order
diff --git a/doc/snippets/Cwise_cbrt.cpp b/doc/snippets/Cwise_cbrt.cpp
new file mode 100644
index 0000000..a58c76c
--- /dev/null
+++ b/doc/snippets/Cwise_cbrt.cpp
@@ -0,0 +1,2 @@
+Array3d v(1,2,4);
+cout << v.cbrt() << endl;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e0b2c83..0614a20 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -234,6 +234,7 @@
ei_add_test(product_trsolve)
ei_add_test(product_mmtr)
ei_add_test(product_notemporary)
+ei_add_test(product_threaded "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(stable_norm)
ei_add_test(permutationmatrices)
ei_add_test(bandmatrix)
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 058b721..bfea96a 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -169,7 +169,9 @@
template <typename Scalar>
void unary_ops_test() {
+
unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sqrt));
+ unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cbrt));
unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(exp));
unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(log));
unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sin));
@@ -821,6 +823,7 @@
m3 = m4.abs();
VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
+ VERIFY_IS_APPROX(m3.cbrt(), cbrt(m3));
VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
VERIFY_IS_APPROX(m3.log(), log(m3));
@@ -882,6 +885,8 @@
VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
+ VERIFY_IS_APPROX(m3.pow(RealScalar(1.0/3.0)), m3.cbrt());
+ VERIFY_IS_APPROX(pow(m3,RealScalar(1.0/3.0)), m3.cbrt());
VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
diff --git a/test/product_threaded.cpp b/test/product_threaded.cpp
new file mode 100644
index 0000000..95dad79
--- /dev/null
+++ b/test/product_threaded.cpp
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_GEMM_THREADPOOL
+#include "main.h"
+
+void test_parallelize_gemm() {
+ constexpr int n = 1024;
+ constexpr int num_threads = 4;
+ MatrixXf a(n,n);
+ MatrixXf b(n,n);
+ MatrixXf c(n,n);
+ c.noalias() = a*b;
+
+ ThreadPool pool(num_threads);
+ MatrixXf c_threaded(n,n);
+ c_threaded.noalias() = a*b;
+
+ VERIFY_IS_APPROX(c, c_threaded);
+}
+
+
+
+EIGEN_DECLARE_TEST(product_threaded)
+{
+ CALL_SUBTEST(test_parallelize_gemm());
+}
diff --git a/test/ref.cpp b/test/ref.cpp
index f283537..f0faa94 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -341,6 +341,17 @@
VERIFY(test_is_equal(data1, obj_data2, MatrixType::MaxSizeAtCompileTime == Dynamic && owns_data));
}
+template <typename MatrixType>
+void test_contiguous_ref_no_copy(const PlainObjectBase<MatrixType> &obj) {
+ typedef Ref<MatrixType, Unaligned, Stride<0, 0>> Ref_;
+ typedef Ref<const MatrixType, Unaligned, Stride<0, 0>> CRef_;
+ MatrixType m(obj);
+ Ref_ ref(m);
+ VERIFY(test_is_equal(ref.data(), m.data(), true));
+ CRef_ cref(m);
+ VERIFY(test_is_equal(cref.data(), m.data(), true));
+}
+
EIGEN_DECLARE_TEST(ref)
{
for(int i = 0; i < g_repeat; i++) {
@@ -375,4 +386,8 @@
CALL_SUBTEST_9( test_cref_move_ctor<MatrixXd>(MatrixXd(9, 5)) );
CALL_SUBTEST_9( test_cref_move_ctor<Matrix3d>(Matrix3d::Ones()) );
CALL_SUBTEST_9( test_cref_move_ctor<Matrix3d>(Matrix3d()) );
+ CALL_SUBTEST_10(test_contiguous_ref_no_copy(VectorXd(9)));
+ CALL_SUBTEST_10(test_contiguous_ref_no_copy(Vector3d()));
+ CALL_SUBTEST_10(test_contiguous_ref_no_copy(MatrixXd(9, 5)));
+ CALL_SUBTEST_10(test_contiguous_ref_no_copy(Matrix3d()));
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 7576808..9567d3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -1548,10 +1548,10 @@
Index nn0 = numext::div_ceil(n, bn);
Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn);
double new_parallelism = static_cast<double>(new_tasks) /
- (numext::div_ceil<int>(new_tasks, num_threads) * num_threads);
+ (numext::div_ceil<Index>(new_tasks, num_threads) * num_threads);
Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
double old_parallelism = static_cast<double>(old_tasks) /
- (numext::div_ceil<int>(old_tasks, num_threads) * num_threads);
+ (numext::div_ceil<Index>(old_tasks, num_threads) * num_threads);
if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
return 0;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 53b66c0..abf6834 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -373,7 +373,7 @@
// computations:
double max_efficiency =
static_cast<double>(block_count) /
- (numext::div_ceil<int>(block_count, numThreads()) * numThreads());
+ (numext::div_ceil<Index>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
@@ -396,7 +396,7 @@
prev_block_count = coarser_block_count;
const double coarser_efficiency =
static_cast<double>(coarser_block_count) /
- (numext::div_ceil<int>(coarser_block_count, numThreads()) * numThreads());
+ (numext::div_ceil<Index>(coarser_block_count, numThreads()) * numThreads());
if (coarser_efficiency + 0.01 >= max_efficiency) {
// Taking it.
block_size = coarser_block_size;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4eebbe7..e8fb85d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -655,13 +655,11 @@
const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks =
- numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor(),
- NumTraits<StorageIndex>::highest()) /
- block_size;
+ static_cast<int>(numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
+ NumTraits<StorageIndex>::highest()) / block_size);
const StorageIndex size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, numext::div_ceil<int>(size, block_size)), 1);
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);
LAUNCH_GPU_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index c1ad675..062a56b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -271,7 +271,7 @@
// Make sure the split point is aligned on a packet boundary.
const typename Self::Index split =
packetSize *
- divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
+ numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)),
packetSize);
const typename Self::Index num_left =
numext::mini(split - firstIndex, numValuesToReduce);