Update Eigen to commit:f3912891504ad965d5fe8bd11d7346faa69b4026 CHANGELOG ========= f39128915 - Fix bug in checking subnormals. 5a90fbcea - Update documentation of lapack second/dsecnd. 907982024 - Remove simple relicense script. 851b40afd - LAPACK CPU time functions. a73970a86 - Fix arm32 issues. 580812201 - Formatting. 92f9544f6 - Remove explicit specialization of member function. 2692fb2b7 - Fix compile-time error caused by chip static asserts 2c6b61c00 - Add half and quarter vector support to HVX architecture 05a457534 - Chipping Asserts v2 fc92fe312 - SPQR: Fix build error, Index/StorageIndex mismatch. eede526b7 - [Compressed Storage] Use smaller type of Index & StorageIndex for determining maximum size during resize. 772057c55 - Revert "Add asserts for .chip" 6163dbe2b - Allow specifying a temporary directory for fileio outputs. 6b6bb9d34 - Fix unused warnings in failtest. f6e41e643 - Revert "Clean up stableNorm" b1ae206ea - Add asserts for .chip b0f906419 - add missing constexpr qualifier 34fd46a9b - Update CI with testing framework from eigen_ci_cross_testing. b2814d53a - Fix stableNorm when input is zero-sized. c29a41011 - check pointers before freeing 48e0c827d - [ROCm] MI300 related test support 538577301 - Fix TensorForcedEval in the case of the evaluator being copied. 3f3bc6d86 - Improve documentation of SparseLU d33174d5a - Doc: Fix Basic slicing examples minor issues 19b119288 - Add factor getters to Cholmod LLT/LDLT a1a96fafd - Clean up stableNorm 3026f1f29 - Fix various asan errors. a2cf99ec6 - Fix GPU+clang+asan. fee5d60b5 - Fix MSAN failures. 9697d481c - Fix up clang-format CI. 85efa8329 - Set up clang-format in CI 2c4541f73 - fix msvc clz 75e273afc - Add internal ctz/clz implementation. PiperOrigin-RevId: 601659527 Change-Id: I52089e61fd81692bb2147f7d15ad98eb56da27c1
diff --git a/Eigen/Core b/Eigen/Core index 39f2b3f..f9d9974 100644 --- a/Eigen/Core +++ b/Eigen/Core
@@ -378,10 +378,6 @@ #include "src/Core/arch/AVX512/GemmKernel.h" #endif -#if defined(EIGEN_VECTORIZE_HVX) -#include "src/Core/arch/HVX/GeneralBlockPanelKernel.h" -#endif - #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" #include "src/Core/PartialReduxEvaluator.h"
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index be2e737..447a393 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -149,11 +149,20 @@ /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix. * The data are not copied but shared. */ -template <typename Scalar, int Flags, typename StorageIndex> -Map<SparseMatrix<Scalar, Flags, StorageIndex> > viewAsEigen(cholmod_sparse& cm) { - return Map<SparseMatrix<Scalar, Flags, StorageIndex> >(cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol], - static_cast<StorageIndex*>(cm.p), - static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x)); +template <typename Scalar, typename StorageIndex> +Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_sparse& cm) { + return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >( + cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol], static_cast<StorageIndex*>(cm.p), + static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x)); +} + +/** Returns a view of the Cholmod sparse matrix factor \a cm as an Eigen sparse matrix. + * The data are not copied but shared. */ +template <typename Scalar, typename StorageIndex> +Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_factor& cm) { + return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >( + cm.n, cm.n, static_cast<StorageIndex*>(cm.p)[cm.n], static_cast<StorageIndex*>(cm.p), + static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x)); } namespace internal { @@ -188,6 +197,7 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) +EIGEN_CHOLMOD_SPECIALIZE1(cholmod_sparse*, factor_to_sparse, cholmod_factor, L) template <typename StorageIndex_> inline cholmod_dense* cm_solve(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) { @@ -377,7 +387,7 @@ // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) 
// NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's // sparse solver) - dest.derived() = viewAsEigen<typename DestDerived::Scalar, ColMajor, typename DestDerived::StorageIndex>(*x_cs); + dest.derived() = viewAsEigen<typename DestDerived::Scalar, typename DestDerived::StorageIndex>(*x_cs); internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod); } #endif // EIGEN_PARSED_BY_DOXYGEN @@ -483,6 +493,11 @@ public: typedef MatrixType_ MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef TriangularView<const MatrixType, Eigen::Lower> MatrixL; + typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::Upper> MatrixU; CholmodSimplicialLLT() : Base() { init(); } @@ -493,6 +508,12 @@ ~CholmodSimplicialLLT() {} + /** \returns an expression of the factor L */ + inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); } + + /** \returns an expression of the factor U (= L^*) */ + inline MatrixU matrixU() const { return matrixL().adjoint(); } + protected: void init() { m_cholmod.final_asis = 0; @@ -531,6 +552,12 @@ public: typedef MatrixType_ MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef Matrix<Scalar, Dynamic, 1> VectorType; + typedef TriangularView<const MatrixType, Eigen::UnitLower> MatrixL; + typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU; CholmodSimplicialLDLT() : Base() { init(); } @@ -541,6 +568,26 @@ ~CholmodSimplicialLDLT() {} + /** \returns a vector expression of the diagonal D */ + inline VectorType vectorD() const { + auto cholmodL = viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); + + VectorType D{cholmodL.rows()}; + + for (Index k = 
0; k < cholmodL.outerSize(); ++k) { + typename decltype(cholmodL)::InnerIterator it{cholmodL, k}; + D(k) = it.value(); + } + + return D; + } + + /** \returns an expression of the factor L */ + inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); } + + /** \returns an expression of the factor U (= L^*) */ + inline MatrixU matrixU() const { return matrixL().adjoint(); } + protected: void init() { m_cholmod.final_asis = 1; @@ -578,6 +625,9 @@ public: typedef MatrixType_ MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; CholmodSupernodalLLT() : Base() { init(); } @@ -588,6 +638,19 @@ ~CholmodSupernodalLLT() {} + /** \returns an expression of the factor L */ + inline MatrixType matrixL() const { + // Convert Cholmod factor's supernodal storage format to Eigen's CSC storage format + cholmod_sparse* cholmodL = internal::cm_factor_to_sparse(*Base::m_cholmodFactor, m_cholmod); + MatrixType L = viewAsEigen<Scalar, StorageIndex>(*cholmodL); + internal::cm_free_sparse<StorageIndex>(cholmodL, m_cholmod); + + return L; + } + + /** \returns an expression of the factor U (= L^*) */ + inline MatrixType matrixU() const { return matrixL().adjoint(); } + protected: void init() { m_cholmod.final_asis = 1;
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 5936336..3a302c0 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h
@@ -612,11 +612,7 @@ } }; -#ifndef SYCL_DEVICE_ONLY -#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func -#else #define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& a, const Type& b) { return Func(a, b); } -#endif /** \internal \returns the min of \a a and \a b (coeff-wise). If \a a or \b b is NaN, the return value is implementation defined. */
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 95f9b97..0be29bc 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h
@@ -628,6 +628,147 @@ // no value, error at compile time }; +template <typename BitsType, typename EnableIf = void> +struct count_bits_impl { + static_assert(std::is_integral<BitsType>::value && std::is_unsigned<BitsType>::value, + "BitsType must be an unsigned integer"); + + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits >> shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits << shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } +}; + +// Count leading zeros. +template <typename BitsType> +EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + return count_bits_impl<BitsType>::clz(bits); +} + +// Count trailing zeros. +template <typename BitsType> +EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return count_bits_impl<BitsType>::ctz(bits); +} + +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template <typename BitsType> +struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned int)>> { + static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static constexpr int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return bits == 0 ? 
kNumBits : __builtin_ctz(static_cast<unsigned int>(bits)); + } +}; + +template <typename BitsType> +struct count_bits_impl< + BitsType, std::enable_if_t<sizeof(unsigned int) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(unsigned long)>> { + static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static constexpr int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return bits == 0 ? kNumBits : __builtin_ctzl(static_cast<unsigned long>(bits)); + } +}; + +template <typename BitsType> +struct count_bits_impl<BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) && + sizeof(BitsType) <= sizeof(unsigned long long)>> { + static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static constexpr int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return bits == 0 ? 
kNumBits : __builtin_ctzll(static_cast<unsigned long long>(bits)); + } +}; + +#elif EIGEN_COMP_MSVC + +template <typename BitsType> +struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned long)>> { + static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + unsigned long out; + _BitScanReverse(&out, static_cast<unsigned long>(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + unsigned long out; + _BitScanForward(&out, static_cast<unsigned long>(bits)); + return bits == 0 ? kNumBits : static_cast<int>(out); + } +}; + +#ifdef _WIN64 + +template <typename BitsType> +struct count_bits_impl< + BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(__int64)>> { + static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + unsigned long out; + _BitScanReverse64(&out, static_cast<unsigned __int64>(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + unsigned long out; + _BitScanForward64(&out, static_cast<unsigned __int64>(bits)); + return bits == 0 ? kNumBits : static_cast<int>(out); + } +}; + +#endif // _WIN64 + +#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + template <typename Scalar> struct random_default_impl<Scalar, false, true> { static inline Scalar run(const Scalar& x, const Scalar& y) {
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index a8307c7..f9bf737 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h
@@ -204,7 +204,9 @@ * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. * * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { return m_storage.data()[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index index) const { + return m_storage.data()[index]; + } /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h index fc11174..131e6f1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H -#define EIGEN_PACKET_MATH_FP16_AVX512_H - -// IWYU pragma: private -#include "../../InternalHeaderCheck.h" - -namespace Eigen { - -namespace internal { - -typedef __m512h Packet32h; -typedef eigen_packet_wrapper<__m256i, 1> Packet16h; -typedef eigen_packet_wrapper<__m128i, 2> Packet8h; - -template <> -struct is_arithmetic<Packet8h> { - enum { value = true }; -}; - -template <> -struct packet_traits<half> : default_packet_traits { - typedef Packet32h type; - typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 32, - - HasCmp = 1, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 1, - HasAbs2 = 0, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasLog = 1, - HasLog1p = 1, - HasExp = 1, - HasExpm1 = 1, - HasSqrt = 1, - HasRsqrt = 1, - // These ones should be implemented in future - HasBessel = 0, - HasNdtri = 0, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasTanh = EIGEN_FAST_MATH, - HasErf = 0, // EIGEN_FAST_MATH, - HasBlend = 0, - HasRound = 1, - HasFloor = 1, - HasCeil = 1, - HasRint = 1 - }; -}; - -template <> -struct unpacket_traits<Packet32h> { - typedef Eigen::half type; - typedef Packet16h half; - enum { - size = 32, - alignment = Aligned64, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits<Packet16h> { - typedef Eigen::half type; - typedef Packet8h half; - enum { - size = 16, - alignment = Aligned32, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> 
-struct unpacket_traits<Packet8h> { - typedef Eigen::half type; - typedef Packet8h half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -// Memory functions - -// pset1 - -template <> -EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) { - return _mm512_set1_ph(static_cast<_Float16>(from)); -} - -// pset1frombits -template <> -EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) { - return _mm512_castsi512_ph(_mm512_set1_epi16(from)); -} - -// pfirst - -template <> -EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return half_impl::raw_uint16_to_half( - static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0))); -#else - Eigen::half dest[32]; - _mm512_storeu_ph(dest, from); - return dest[0]; -#endif -} - -// pload - -template <> -EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) { - EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from); -} - -// ploadu - -template <> -EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from); -} - -// pstore - -template <> -EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) { - EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from); -} - -// pstoreu - -template <> -EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) { - EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from); -} - -// ploaddup -template <> -EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) { - __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from)); - return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, - 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), - a); -} - -// ploadquad -template <> 
-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) { - __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from)); - return _mm512_permutexvar_ph( - _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0), - a); -} - -// pabs - -template <> -EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) { - return _mm512_abs_ph(a); -} - -// psignbit - -template <> -EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) { - return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15)); -} - -// pmin - -template <> -EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) { - return _mm512_min_ph(a, b); -} - -// pmax - -template <> -EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) { - return _mm512_max_ph(a, b); -} - -// plset -template <> -EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) { - return _mm512_add_ph(_mm512_set1_ph(a), - _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f, - 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, - 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); -} - -// por - -template <> -EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) { - return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); -} - -// pxor - -template <> -EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) { - return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); -} - -// pand - -template <> -EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) { - return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); -} - -// pandnot - -template <> -EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) { - return 
_mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a))); -} - -// pselect - -template <> -EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) { - __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ); - return _mm512_mask_blend_ph(mask32, a, b); -} - -// pcmp_eq - -template <> -EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) { - __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ); - return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); -} - -// pcmp_le - -template <> -EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) { - __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ); - return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); -} - -// pcmp_lt - -template <> -EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) { - __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ); - return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); -} - -// pcmp_lt_or_nan - -template <> -EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) { - __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ); - return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu)); -} - -// padd - -template <> -EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) { - return _mm512_add_ph(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); -} - -// psub - 
-template <> -EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) { - return _mm512_sub_ph(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); -} - -// pmul - -template <> -EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) { - return _mm512_mul_ph(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); -} - -// pdiv - -template <> -EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) { - return _mm512_div_ph(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); -} - -// pround - -template <> -EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) { - // Work-around for default std::round rounding mode. 
- - // Mask for the sign bit - const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u)); - // The largest half-preicision float less than 0.5 - const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu)); - - return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO); -} - -// print - -template <> -EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) { - return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION); -} - -// pceil - -template <> -EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) { - return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF); -} - -// pfloor - -template <> -EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) { - return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF); -} - -// predux -template <> -EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) { - return (half)_mm512_reduce_add_ph(a); -} - -template <> -EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) { - return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a)); -} - -template <> -EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) { - return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a)); -} - -// predux_half_dowto4 -template <> -EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0)); - __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1)); - - return Packet16h(padd<Packet16h>(lowHalf, highHalf)); -#else - Eigen::half data[32]; - _mm512_storeu_ph(data, a); - - __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data)); - __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16)); - - return Packet16h(padd<Packet16h>(lowHalf, highHalf)); -#endif -} - -// predux_max - -// predux_min - -// predux_mul - -#ifdef 
EIGEN_VECTORIZE_FMA - -// pmadd - -template <> -EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) { - return _mm512_fmadd_ph(a, b, c); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); -} - -// pmsub - -template <> -EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) { - return _mm512_fmsub_ph(a, b, c); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); -} - -// pnmadd - -template <> -EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) { - return _mm512_fnmadd_ph(a, b, c); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); -} - -// pnmsub - -template <> -EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const 
Packet32h& c) { - return _mm512_fnmsub_ph(a, b, c); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); -} - -#endif - -// pnegate - -template <> -EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) { - return _mm512_sub_ph(_mm512_set1_ph(0.0), a); -} - -// pconj - -template <> -EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) { - return a; -} - -// psqrt - -template <> -EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) { - return _mm512_sqrt_ph(a); -} - -// prsqrt - -template <> -EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) { - return _mm512_rsqrt_ph(a); -} - -// preciprocal - -template <> -EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) { - return _mm512_rcp_ph(a); -} - -// ptranspose - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) { - __m512i t[32]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i < 16; i++) { - t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1])); - t[2 * i + 1] = - _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1])); - } - - __m512i p[32]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i < 8; i++) { - p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]); - p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]); - p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]); - p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]); - } - - __m512i q[32]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i < 4; i++) 
{ - q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]); - q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]); - q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]); - q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]); - q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]); - q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]); - q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]); - q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]); - } - - __m512i f[32]; - -#define PACKET32H_TRANSPOSE_HELPER(X, Y) \ - do { \ - f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \ - f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \ - f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \ - f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \ - f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \ - f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \ - f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \ - f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \ - } while (false); - - PACKET32H_TRANSPOSE_HELPER(0, 0); - PACKET32H_TRANSPOSE_HELPER(1, 1); - PACKET32H_TRANSPOSE_HELPER(2, 2); - PACKET32H_TRANSPOSE_HELPER(3, 3); - - PACKET32H_TRANSPOSE_HELPER(1, 0); - PACKET32H_TRANSPOSE_HELPER(2, 0); - PACKET32H_TRANSPOSE_HELPER(3, 0); - PACKET32H_TRANSPOSE_HELPER(2, 1); - PACKET32H_TRANSPOSE_HELPER(3, 1); - PACKET32H_TRANSPOSE_HELPER(3, 2); - - PACKET32H_TRANSPOSE_HELPER(0, 1); - PACKET32H_TRANSPOSE_HELPER(0, 2); - PACKET32H_TRANSPOSE_HELPER(0, 3); - PACKET32H_TRANSPOSE_HELPER(1, 2); - PACKET32H_TRANSPOSE_HELPER(1, 3); - PACKET32H_TRANSPOSE_HELPER(2, 
3); - -#undef PACKET32H_TRANSPOSE_HELPER - - EIGEN_UNROLL_LOOP - for (int i = 0; i < 32; i++) { - a.packet[i] = _mm512_castsi512_ph(f[i]); - } -} - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) { - __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3; - t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1])); - t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1])); - t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3])); - t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3])); - - p0 = _mm512_unpacklo_epi32(t0, t2); - p1 = _mm512_unpackhi_epi32(t0, t2); - p2 = _mm512_unpacklo_epi32(t1, t3); - p3 = _mm512_unpackhi_epi32(t1, t3); - - a0 = p0; - a1 = p1; - a2 = p2; - a3 = p3; - - a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1); - a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0); - - a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2); - a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0); - - a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3); - a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0); - - a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2); - a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1); - - a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3); - a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2); - - a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3); - a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1); - - a.packet[0] = _mm512_castsi512_ph(a0); - a.packet[1] = _mm512_castsi512_ph(a1); - a.packet[2] = _mm512_castsi512_ph(a2); - a.packet[3] = _mm512_castsi512_ph(a3); -} - -// preverse - -template <> -EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) { - return 
_mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - a); -} - -// pscatter - -template <> -EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) { - EIGEN_ALIGN64 half aux[32]; - pstore(aux, from); - - EIGEN_UNROLL_LOOP - for (int i = 0; i < 32; i++) { - to[stride * i] = aux[i]; - } -} - -// pgather - -template <> -EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) { - return _mm512_castsi512_ph(_mm512_set_epi16( - from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x, - from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x, - from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, - from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x, - from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x, - from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x, - from[1 * stride].x, from[0 * stride].x)); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h 
pfrexp<Packet16h>(const Packet16h&, Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&); - -EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) { - __m512d result = _mm512_undefined_pd(); - result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0); - result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1); - return _mm512_castpd_ph(result); -} - -EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) { - a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0)); - b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1)); -} - -// psin -template <> -EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = psin(low); - Packet16h highOut = psin(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pcos -template <> -EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = pcos(low); - Packet16h highOut = pcos(high); - - return combine2Packet16h(lowOut, highOut); -} - -// plog -template <> -EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = plog(low); - Packet16h highOut = plog(high); - - return combine2Packet16h(lowOut, highOut); -} - -// plog2 -template <> -EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = plog2(low); - Packet16h highOut = plog2(high); - - return combine2Packet16h(lowOut, highOut); -} - -// plog1p -template <> -EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - 
- Packet16h lowOut = plog1p(low); - Packet16h highOut = plog1p(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pexp -template <> -EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = pexp(low); - Packet16h highOut = pexp(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pexpm1 -template <> -EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = pexpm1(low); - Packet16h highOut = pexpm1(high); - - return combine2Packet16h(lowOut, highOut); -} - -// ptanh -template <> -EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = ptanh(low); - Packet16h highOut = ptanh(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pfrexp -template <> -EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h exp1 = _mm256_undefined_si256(); - Packet16h exp2 = _mm256_undefined_si256(); - - Packet16h lowOut = pfrexp(low, exp1); - Packet16h highOut = pfrexp(high, exp2); - - exponent = combine2Packet16h(exp1, exp2); - - return combine2Packet16h(lowOut, highOut); -} - -// pldexp -template <> -EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h exp1; - Packet16h exp2; - extract2Packet16h(exponent, exp1, exp2); - - Packet16h lowOut = pldexp(low, exp1); - Packet16h highOut = pldexp(high, exp2); - - return combine2Packet16h(lowOut, highOut); -} - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_FP16_AVX512_H +// This file is part of Eigen, a lightweight C++ 
template library +// for linear algebra. +// +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H +#define EIGEN_PACKET_MATH_FP16_AVX512_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +typedef __m512h Packet32h; +typedef eigen_packet_wrapper<__m256i, 1> Packet16h; +typedef eigen_packet_wrapper<__m128i, 2> Packet8h; + +template <> +struct is_arithmetic<Packet8h> { + enum { value = true }; +}; + +template <> +struct packet_traits<half> : default_packet_traits { + typedef Packet32h type; + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExp = 1, + HasExpm1 = 1, + HasSqrt = 1, + HasRsqrt = 1, + // These ones should be implemented in future + HasBessel = 0, + HasNdtri = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0, // EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1 + }; +}; + +template <> +struct unpacket_traits<Packet32h> { + typedef Eigen::half type; + typedef Packet16h half; + enum { + size = 32, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits<Packet16h> { + typedef Eigen::half type; + typedef Packet8h half; + enum { + size = 16, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits<Packet8h> { + typedef Eigen::half type; + typedef 
Packet8h half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +// Memory functions + +// pset1 + +template <> +EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) { + return _mm512_set1_ph(static_cast<_Float16>(from)); +} + +// pset1frombits +template <> +EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) { + return _mm512_castsi512_ph(_mm512_set1_epi16(from)); +} + +// pfirst + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return half_impl::raw_uint16_to_half( + static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0))); +#else + Eigen::half dest[32]; + _mm512_storeu_ph(dest, from); + return dest[0]; +#endif +} + +// pload + +template <> +EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from); +} + +// ploadu + +template <> +EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from); +} + +// pstore + +template <> +EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from); +} + +// pstoreu + +template <> +EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from); +} + +// ploaddup +template <> +EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) { + __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from)); + return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, + 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), + a); +} + +// ploadquad +template <> +EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* 
from) { + __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from)); + return _mm512_permutexvar_ph( + _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0), + a); +} + +// pabs + +template <> +EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) { + return _mm512_abs_ph(a); +} + +// psignbit + +template <> +EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) { + return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15)); +} + +// pmin + +template <> +EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) { + return _mm512_min_ph(a, b); +} + +// pmax + +template <> +EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) { + return _mm512_max_ph(a, b); +} + +// plset +template <> +EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) { + return _mm512_add_ph(_mm512_set1_ph(a), + _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f, + 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, + 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +} + +// por + +template <> +EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); +} + +// pxor + +template <> +EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); +} + +// pand + +template <> +EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); +} + +// pandnot + +template <> +EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), 
_mm512_castph_si512(a))); +} + +// pselect + +template <> +EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) { + __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ); + return _mm512_mask_blend_ph(mask32, a, b); +} + +// pcmp_eq + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); +} + +// pcmp_le + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); +} + +// pcmp_lt + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); +} + +// pcmp_lt_or_nan + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu)); +} + +// padd + +template <> +EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) { + return _mm512_add_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// psub + +template <> +EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const 
Packet32h& a, const Packet32h& b) { + return _mm512_sub_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// pmul + +template <> +EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) { + return _mm512_mul_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// pdiv + +template <> +EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) { + return _mm512_div_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// pround + +template <> +EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) { + // Work-around for default std::round rounding mode. 
+ + // Mask for the sign bit + const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u)); + // The largest half-precision float less than 0.5 + const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu)); + + return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +// print + +template <> +EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) { + return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION); +} + +// pceil + +template <> +EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) { + return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF); +} + +// pfloor + +template <> +EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) { + return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF); +} + +// predux +template <> +EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) { + return (half)_mm512_reduce_add_ph(a); +} + +template <> +EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) { + return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) { + return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a)); +} + +// predux_half_dowto4 +template <> +EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0)); + __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1)); + + return Packet16h(padd<Packet16h>(lowHalf, highHalf)); +#else + Eigen::half data[32]; + _mm512_storeu_ph(data, a); + + __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data)); + __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16)); + + return Packet16h(padd<Packet16h>(lowHalf, highHalf)); +#endif +} + +// predux_max + +// predux_min + +// predux_mul + +#ifdef 
EIGEN_VECTORIZE_FMA + +// pmadd + +template <> +EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fmadd_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +// pmsub + +template <> +EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fmsub_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +// pnmadd + +template <> +EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fnmadd_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +// pnmsub + +template <> +EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const 
Packet32h& c) { + return _mm512_fnmsub_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +#endif + +// pnegate + +template <> +EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) { + return _mm512_sub_ph(_mm512_set1_ph(0.0), a); +} + +// pconj + +template <> +EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) { + return a; +} + +// psqrt + +template <> +EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) { + return _mm512_sqrt_ph(a); +} + +// prsqrt + +template <> +EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) { + return _mm512_rsqrt_ph(a); +} + +// preciprocal + +template <> +EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) { + return _mm512_rcp_ph(a); +} + +// ptranspose + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) { + __m512i t[32]; + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 16; i++) { + t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1])); + t[2 * i + 1] = + _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1])); + } + + __m512i p[32]; + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 8; i++) { + p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]); + p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]); + p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]); + p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]); + } + + __m512i q[32]; + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 4; i++) 
{ + q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]); + q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]); + q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]); + q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]); + q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]); + q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]); + q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]); + q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]); + } + + __m512i f[32]; + +#define PACKET32H_TRANSPOSE_HELPER(X, Y) \ + do { \ + f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \ + f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \ + f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \ + f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \ + f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \ + f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \ + f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \ + f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \ + } while (false); + + PACKET32H_TRANSPOSE_HELPER(0, 0); + PACKET32H_TRANSPOSE_HELPER(1, 1); + PACKET32H_TRANSPOSE_HELPER(2, 2); + PACKET32H_TRANSPOSE_HELPER(3, 3); + + PACKET32H_TRANSPOSE_HELPER(1, 0); + PACKET32H_TRANSPOSE_HELPER(2, 0); + PACKET32H_TRANSPOSE_HELPER(3, 0); + PACKET32H_TRANSPOSE_HELPER(2, 1); + PACKET32H_TRANSPOSE_HELPER(3, 1); + PACKET32H_TRANSPOSE_HELPER(3, 2); + + PACKET32H_TRANSPOSE_HELPER(0, 1); + PACKET32H_TRANSPOSE_HELPER(0, 2); + PACKET32H_TRANSPOSE_HELPER(0, 3); + PACKET32H_TRANSPOSE_HELPER(1, 2); + PACKET32H_TRANSPOSE_HELPER(1, 3); + PACKET32H_TRANSPOSE_HELPER(2, 
3); + +#undef PACKET32H_TRANSPOSE_HELPER + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 32; i++) { + a.packet[i] = _mm512_castsi512_ph(f[i]); + } +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) { + __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3; + t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1])); + t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1])); + t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3])); + t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3])); + + p0 = _mm512_unpacklo_epi32(t0, t2); + p1 = _mm512_unpackhi_epi32(t0, t2); + p2 = _mm512_unpacklo_epi32(t1, t3); + p3 = _mm512_unpackhi_epi32(t1, t3); + + a0 = p0; + a1 = p1; + a2 = p2; + a3 = p3; + + a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1); + a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0); + + a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2); + a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0); + + a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3); + a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0); + + a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2); + a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1); + + a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3); + a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2); + + a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3); + a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1); + + a.packet[0] = _mm512_castsi512_ph(a0); + a.packet[1] = _mm512_castsi512_ph(a1); + a.packet[2] = _mm512_castsi512_ph(a2); + a.packet[3] = _mm512_castsi512_ph(a3); +} + +// preverse + +template <> +EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) { + return 
_mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + a); +} + +// pscatter + +template <> +EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) { + EIGEN_ALIGN64 half aux[32]; + pstore(aux, from); + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 32; i++) { + to[stride * i] = aux[i]; + } +} + +// pgather + +template <> +EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) { + return _mm512_castsi512_ph(_mm512_set_epi16( + from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x, + from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x, + from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, + from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x, + from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x, + from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x, + from[1 * stride].x, from[0 * stride].x)); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h 
pfrexp<Packet16h>(const Packet16h&, Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&); + +EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) { + __m512d result = _mm512_undefined_pd(); + result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0); + result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1); + return _mm512_castpd_ph(result); +} + +EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) { + a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0)); + b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1)); +} + +// psin +template <> +EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = psin(low); + Packet16h highOut = psin(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pcos +template <> +EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = pcos(low); + Packet16h highOut = pcos(high); + + return combine2Packet16h(lowOut, highOut); +} + +// plog +template <> +EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = plog(low); + Packet16h highOut = plog(high); + + return combine2Packet16h(lowOut, highOut); +} + +// plog2 +template <> +EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = plog2(low); + Packet16h highOut = plog2(high); + + return combine2Packet16h(lowOut, highOut); +} + +// plog1p +template <> +EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + 
+ Packet16h lowOut = plog1p(low); + Packet16h highOut = plog1p(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pexp +template <> +EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = pexp(low); + Packet16h highOut = pexp(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pexpm1 +template <> +EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = pexpm1(low); + Packet16h highOut = pexpm1(high); + + return combine2Packet16h(lowOut, highOut); +} + +// ptanh +template <> +EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = ptanh(low); + Packet16h highOut = ptanh(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pfrexp +template <> +EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h exp1 = _mm256_undefined_si256(); + Packet16h exp2 = _mm256_undefined_si256(); + + Packet16h lowOut = pfrexp(low, exp1); + Packet16h highOut = pfrexp(high, exp2); + + exponent = combine2Packet16h(exp1, exp2); + + return combine2Packet16h(lowOut, highOut); +} + +// pldexp +template <> +EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h exp1; + Packet16h exp2; + extract2Packet16h(exponent, exp1, exp2); + + Packet16h lowOut = pldexp(low, exp1); + Packet16h highOut = pldexp(high, exp2); + + return combine2Packet16h(lowOut, highOut); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 8fb5b68..d84b1cc 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -582,8 +582,8 @@ // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4 // using "Extended precision modular arithmetic" -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) - // This version requires true FMA for high accuracy +#if defined(EIGEN_VECTORIZE_FMA) + // This version requires true FMA for high accuracy. // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x); @@ -1181,7 +1181,7 @@ s_lo = psub(y, t); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#ifdef EIGEN_VECTORIZE_FMA // This function implements the extended precision product of // a pair of floating point numbers. Given {x, y}, it computes the pair // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and @@ -1227,7 +1227,7 @@ p_lo = pmadd(x_lo, y_lo, p_lo); } -#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif // EIGEN_VECTORIZE_FMA // This function implements Dekker's algorithm for the addition // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h deleted file mode 100644 index a159739..0000000 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ /dev/null
@@ -1,41 +0,0 @@ -#ifndef EIGEN_HVX_GENERAL_BLOCK_KERNEL_H -#define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H - -// Only support 128B HVX now. -// Floating-point operations are only supported since V68. -#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 - -namespace Eigen { -namespace internal { - -template <bool ConjLhs_, bool ConjRhs_, int PacketSize_> -class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_> - : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> { - public: - typedef Packet32qf AccPacket; - - EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero<Packet32qf>(p); } - - template <typename LaneIdType> - EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, Packet32qf& c, Packet32f& /*tmp*/, - const LaneIdType&) const { - c = pmadd_f32_to_qf32(a, b, c); - } - - template <typename LaneIdType> - EIGEN_STRONG_INLINE void madd(const Packet32f& a, const QuadPacket<Packet32f>& b, Packet32qf& c, Packet32f& tmp, - const LaneIdType& lane) const { - madd(a, b.get(lane), c, tmp, lane); - } - - EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, Packet32f& r) const { - r = pmadd_qf32_to_f32(c, alpha, r); - } -}; - -} // end namespace internal -} // end namespace Eigen - -#endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 - -#endif // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 7c69f3b..7e139de 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -18,18 +18,107 @@ namespace Eigen { namespace internal { -EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); } +// HVX utilities. -EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); } +template <int D> +EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) { + HVX_Vector v; +#if EIGEN_COMP_CLANG + // Use inlined assembly for aligned vmem load on unaligned memory. + // Use type cast to HVX_Vector* may mess up with compiler data alignment. + __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory"); +#else + void* aligned_mem = + reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__); + memcpy(&v, aligned_mem, __HVX_LENGTH__); +#endif + return v; +} -EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; } +template <typename T> +EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) { + HVX_Vector v; + memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__); + return v; +} -EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; } +template <typename T> +EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) { + HVX_Vector v; + memcpy(&v, mem, __HVX_LENGTH__); + return v; +} + +template <size_t Size, size_t Alignment, typename T> +EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) { +#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD) + // Fast partial vector load through aligned vmem load. + // The load may past end of array but is aligned to prevent memory fault. + HVX_Vector v0 = HVX_vmem<0>(mem); + HVX_Vector v1 = v0; + uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem); + EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) { + // Data size less than alignment will never cross multiple aligned vectors. 
+ v1 = v0; + } + else { + uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1); + if (left_off + Size * sizeof(T) > __HVX_LENGTH__) { + v1 = HVX_vmem<1>(mem); + } else { + v1 = v0; + } + } + return Q6_V_valign_VVR(v1, v0, mem_addr); +#else + HVX_Vector v; + memcpy(&v, mem, Size * sizeof(T)); + return v; +#endif +} + +template <typename T> +EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) { + memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__); +} + +template <typename T> +EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) { + memcpy(mem, &v, __HVX_LENGTH__); +} + +template <size_t Size, size_t Alignment, typename T> +EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) { + uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem); + HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr); + uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1); + uintptr_t right_off = left_off + Size * sizeof(T); + + HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) { + if (right_off > __HVX_LENGTH__) { + Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value); + qr = Q6_Q_vcmp_eq_VbVb(value, value); + } + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, mem, value); +} + +// Packet definitions. +enum class HVXPacketSize { + Full, + Half, + Quarter, +}; // Hexagon compiler uses same HVX_Vector to represent all HVX vector types. // Wrap different vector type (float32, int32, etc) to different class with // explicit constructor and casting back-and-force to HVX_Vector. 
-template <int unique_id> +template <HVXPacketSize T> class HVXPacket { public: HVXPacket() = default; @@ -41,24 +130,62 @@ HVX_Vector m_val = Q6_V_vzero(); }; -typedef HVXPacket<0> Packet32f; // float32 -typedef HVXPacket<1> Packet32qf; // qfloat32 +typedef HVXPacket<HVXPacketSize::Full> Packet32f; +typedef HVXPacket<HVXPacketSize::Half> Packet16f; +typedef HVXPacket<HVXPacketSize::Quarter> Packet8f; +// Packet traits. template <> struct packet_traits<float> : default_packet_traits { typedef Packet32f type; - typedef Packet32f half; + typedef Packet16f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 32, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 0, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 0, + HasFloor = 0, + HasCeil = 0, + HasRint = 0, + + HasSin = 0, + HasCos = 0, + HasACos = 0, + HasASin = 0, + HasATan = 0, + HasATanh = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasTanh = 0, + HasErf = 0, + HasBessel = 0, + HasNdtri = 0 }; }; template <> struct unpacket_traits<Packet32f> { typedef float type; - typedef Packet32f half; + typedef Packet16f half; enum { size = 32, alignment = Aligned128, @@ -68,15 +195,89 @@ }; }; -// float32 operations. 
template <> -EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) { +struct unpacket_traits<Packet16f> { + typedef float type; + typedef Packet8f half; + enum { + size = 16, + // Many code assume alignment on packet size instead of following trait + // So we do not use Aligned128 to optimize aligned load/store, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits<Packet8f> { + typedef float type; + typedef Packet8f half; + enum { + size = 8, + // Many code assume alignment on packet size instead of following trait + // So we do not use Aligned128 to optimize aligned load/store, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +// float32 operations. +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) { + return HVXPacket<T>::Create(Q6_V_vzero()); +} +template <> +EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) { + return pzero_hvx(Packet32f()); +} +template <> +EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) { + return pzero_hvx(Packet16f()); +} +template <> +EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) { + return pzero_hvx(Packet8f()); +} + +template <HVXPacketSize T> +EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) { + const Index packet_size = unpacket_traits<HVXPacket<T>>::size; + return unpacket_traits<HVXPacket<T>>::half::Create( + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get()))); +} +template <> +EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) { + return predux_half_dowto4_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { + return predux_half_dowto4_hvx(a); +} + +template <HVXPacketSize T> 
+EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) { union { float f; int32_t i; } u; u.f = from; - return Packet32f::Create(Q6_V_vsplat_R(u.i)); + return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i)); +} +template <> +EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) { + return pset1_hvx<HVXPacketSize::Full>(from); +} +template <> +EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) { + return pset1_hvx<HVXPacketSize::Half>(from); +} +template <> +EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { + return pset1_hvx<HVXPacketSize::Quarter>(from); } template <> @@ -84,78 +285,236 @@ return Packet32f::Create(HVX_load(from)); } template <> +EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) { + return Packet16f::Create( + HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { + return Packet8f::Create( + HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from)); +} + +template <> EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) { return Packet32f::Create(HVX_loadu(from)); } +template <> +EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) { + return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { + return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from)); +} template <> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) { HVX_store(to, from.Get()); } template <> +EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) { + HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get()); +} +template <> +EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { + 
HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get()); +} + +template <> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) { HVX_storeu(to, from.Get()); } +template <> +EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) { + HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get()); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { + HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get()); +} +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()))); +} template <> EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()))); + return pmul_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) { + return pmul_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { + return pmul_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get()))); +} template <> EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get()))); + return padd_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) { + return padd_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { + return padd_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> 
psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get()))); +} template <> EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get()))); + return psub_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) { + return psub_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { + return psub_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) { + return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000)); +} template <> EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) { - return psub(Packet32f::Create(Q6_V_vzero()), a); + return pnegate_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { + return pnegate_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { + return pnegate_hvx(a); } -template <> -EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) { +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); + return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); +} +template <> +EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) { + return pcmp_le_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { + return pcmp_le_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { + return pcmp_le_hvx(a, b); } -template <> 
-EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) { +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); + return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); +} +template <> +EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) { + return pcmp_eq_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + return pcmp_eq_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { + return pcmp_eq_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); + return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); +} template <> EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); - HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); + return pcmp_lt_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { + return pcmp_lt_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { + return pcmp_lt_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); + return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); +} template 
<> EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); - HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); + return pcmp_lt_or_nan_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { + return pcmp_lt_or_nan_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { + return pcmp_lt_or_nan_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) { + return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF)); +} template <> EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) { - HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), Q6_V_vzero()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, a.Get(), pnegate(a).Get())); + return pabs_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) { + return pabs_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) { + return pabs_hvx(a); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) { + union { + float array[1]; + HVX_Vector vector; + } HVX_and_array; + HVX_and_array.vector = a.Get(); + return HVX_and_array.array[0]; +} template <> EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) { - float vsf[32] __attribute__((aligned(128))); - pstore(vsf, a); - return vsf[0]; + return pfirst_hvx(a); +} +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { + return pfirst_hvx(a); +} +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { + return pfirst_hvx(a); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) { @@ -166,13 +525,107 @@ // Shuffle the 64-bit lanes. 
HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8); HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8); - kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0)); kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0)); kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2)); kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2)); } +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) { + // Shuffle the 32-bit lanes. + HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); + // Shuffle the 64-bit lanes. + HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8); + + kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0)); + kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64)); + kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0)); + kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64)); +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) { + // Shuffle the 32-bit lanes. + HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); + + // Shuffle the 64-bit lanes. 
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8); + + kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0)); + kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32)); + kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64)); + kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) { + // Shuffle the 32-bit lanes. + HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); + HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4); + HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4); + + // Shuffle the 64-bit lanes. + HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8); + HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8); + + // Shuffle the 128-bit lanes. 
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16); + + kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0)); + kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32)); + kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64)); + kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96)); + kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0)); + kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32)); + kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64)); + kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96)); +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) { + // Shuffle the 32-bit lanes. + HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); + HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4); + HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4); + HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4); + HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4); + HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4); + HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4); + + // Shuffle the 64-bit lanes. 
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8); + HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8); + HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8); + HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8); + + // Shuffle the 128-bit lanes. + v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16); + v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16); + v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16); + v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16); + + // Shuffle the 256-bit lanes. + v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32); + v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32); + v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32); + v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32); + + kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0)); + kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64)); + kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0)); + kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64)); + kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2)); + kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64)); + kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2)); + kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64)); + 
kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4)); + kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64)); + kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4)); + kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64)); + kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6)); + kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64)); + kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6)); + kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64)); +} EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) { // Shuffle the 32-bit lanes. HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); @@ -298,29 +751,67 @@ kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30)); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) { + const Index packet_size = unpacket_traits<HVXPacket<T>>::size; + HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float))); + for (int i = 2; i < packet_size; i <<= 1) { + vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float))); + } + return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum))); +} template <> EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) { - HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get()); - HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4); - HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8); - HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16); - HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32); - return 
pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64))); + return predux_hvx(a); +} +template <> +EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) { + return predux_hvx(a); +} +template <> +EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) { + return predux_hvx(a); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) { + constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2; + HVX_Vector load = HVX_load_partial<size, 0>(from); + HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); + return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup)); +} template <> EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) { - HVX_Vector load = HVX_loadu(from); - HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); - return Packet32f::Create(HEXAGON_HVX_GET_V0(dup)); + return ploaddup_hvx<HVXPacketSize::Full>(from); +} +template <> +EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { + return ploaddup_hvx<HVXPacketSize::Half>(from); +} +template <> +EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { + return ploaddup_hvx<HVXPacketSize::Quarter>(from); } -template <> -EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) { - HVX_Vector load = HVX_loadu(from); +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) { + constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4; + HVX_Vector load = HVX_load_partial<size, 0>(from); HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8); - return Packet32f::Create(HEXAGON_HVX_GET_V0(quad)); + return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad)); +} +template <> +EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) { + return ploadquad_hvx<HVXPacketSize::Full>(from); +} +template <> +EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { + return ploadquad_hvx<HVXPacketSize::Half>(from); +} 
+template <> +EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) { + return ploadquad_hvx<HVXPacketSize::Quarter>(from); } template <> @@ -330,99 +821,249 @@ } template <> -EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get())); +EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) { + HVX_Vector delta = Q6_Vb_vsplat_R(0x3c); + return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta)); } template <> +EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) { + HVX_Vector delta = Q6_Vb_vsplat_R(0x1c); + return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta)); +} + +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get())); +} +template <> +EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) { + return pmin_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { + return pmin_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pmin_hvx(a, b); +} + +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get())); +} +template <> EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get())); + return pmax_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { + return pmax_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pmax_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(a.Get() & b.Get()); +} 
template <> EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(a.Get() & b.Get()); + return pand_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) { + return pand_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { + return pand_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(a.Get() | b.Get()); +} template <> EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(a.Get() | b.Get()); + return por_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) { + return por_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { + return por_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) { + return HVXPacket<T>::Create(a.Get() ^ b.Get()); +} template <> EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) { - return Packet32f::Create(a.Get() ^ b.Get()); + return pxor_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) { + return pxor_hvx(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { + return pxor_hvx(a, b); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) { + return HVXPacket<T>::Create(~a.Get()); +} template <> EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) { - return Packet32f::Create(~a.Get()); + return pnot_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) { + return pnot_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) { + return pnot_hvx(a); } 
+template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) { + HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero()); + return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get())); +} template <> EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) { - HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get())); + return pselect_hvx(mask, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) { + return pselect_hvx(mask, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) { + return pselect_hvx(mask, a, b); } -template <typename Op> -EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) { - Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a); - Packet32f vredux_8 = op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4); - Packet32f vredux_16 = op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8); - Packet32f vredux_32 = op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16); - Packet32f vredux_64 = op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32); - return pfirst(vredux_64); +template <HVXPacketSize T, typename Op> +EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) { + const Index packet_size = unpacket_traits<HVXPacket<T>>::size; + HVXPacket<T> vredux = a; + for (int i = 1; i < packet_size; i <<= 1) { + vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float)))); + } + return pfirst(vredux); } template <> EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) { return predux_generic(a, pmax<Packet32f>); } +template <> +EIGEN_STRONG_INLINE float predux_max(const 
Packet16f& a) { + return predux_generic(a, pmax<Packet16f>); +} +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) { + return predux_generic(a, pmax<Packet8f>); +} template <> EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) { return predux_generic(a, pmin<Packet32f>); } +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { + return predux_generic(a, pmin<Packet16f>); +} +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) { + return predux_generic(a, pmin<Packet8f>); +} template <> EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) { return predux_generic(a, por<Packet32f>) != 0.0f; } +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) { + return predux_generic(a, por<Packet16f>) != 0.0f; +} +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) { + return predux_generic(a, por<Packet8f>) != 0.0f; +} static const float index_vsf[32] - __attribute__((aligned(128))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + __attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) { + return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a)); +} template <> EIGEN_STRONG_INLINE Packet32f plset(const float& a) { - return padd(pload<Packet32f>(index_vsf), pset1<Packet32f>(a)); + return plset_hvx<HVXPacketSize::Full>(a); } - -// qfloat32 operations. 
template <> -EIGEN_STRONG_INLINE Packet32qf pzero<Packet32qf>(const Packet32qf&) { - return Packet32qf::Create(Q6_V_vzero()); +EIGEN_STRONG_INLINE Packet16f plset(const float& a) { + return plset_hvx<HVXPacketSize::Half>(a); } - template <> -EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a, const Packet32qf& b) { - return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get())); +EIGEN_STRONG_INLINE Packet8f plset(const float& a) { + return plset_hvx<HVXPacketSize::Quarter>(a); } +template <HVXPacketSize T> +EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) { + const Index packet_size = unpacket_traits<HVXPacket<T>>::size; + float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__))); + pstore<float>(elements, from); + for (Index i = 0; i < packet_size; ++i) { + to[i * stride] = elements[i]; + } +} template <> -EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a, const Packet32qf& b) { - return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get())); +EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) { + pscatter_hvx(to, from, stride); +} +template <> +EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) { + pscatter_hvx(to, from, stride); +} +template <> +EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) { + pscatter_hvx(to, from, stride); } -// Mixed float32 and qfloat32 operations. 
-EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, const Packet32f& b, const Packet32qf& c) { - return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get())); +template <HVXPacketSize T> +EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) { + const Index packet_size = unpacket_traits<HVXPacket<T>>::size; + float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__))); + for (Index i = 0; i < packet_size; i++) { + elements[i] = from[i * stride]; + } + return pload<HVXPacket<T>>(elements); } - -EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, const Packet32f& b, const Packet32f& c) { - return Packet32f::Create(Q6_Vsf_equals_Vqf32( - Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get()))); +template <> +EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) { + return pgather_hvx<HVXPacketSize::Full>(from, stride); +} +template <> +EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) { + return pgather_hvx<HVXPacketSize::Half>(from, stride); +} +template <> +EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) { + return pgather_hvx<HVXPacketSize::Quarter>(from, stride); } } // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 4e3a14d..71e5f5f 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -1271,7 +1271,7 @@ return pset1<Packet2ul>(0ULL); } -#ifdef __ARM_FEATURE_FMA +#ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c, a, b); @@ -5249,7 +5249,7 @@ return vdivq_f64(a, b); } -#ifdef __ARM_FEATURE_FMA +#ifdef EIGEN_VECTORIZE_FMA // See bug 936. See above comment about FMA for float. template <> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index b16952a..e692438 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -354,6 +354,7 @@ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_VSX 1 +#define EIGEN_VECTORIZE_FMA #include <altivec.h> // We need to #undef all these ugly tokens defined in <altivec.h> // => use __vector instead of vector @@ -365,6 +366,7 @@ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_ALTIVEC +#define EIGEN_VECTORIZE_FMA #include <altivec.h> // We need to #undef all these ugly tokens defined in <altivec.h> // => use __vector instead of vector @@ -431,6 +433,11 @@ #include <arm_fp16.h> #endif +// Enable FMA for ARM. +#if defined(__ARM_FEATURE_FMA) +#define EIGEN_VECTORIZE_FMA +#endif + #if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0)) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 31f1057..6253454 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h
@@ -156,7 +156,7 @@ /** \internal Frees memory allocated with handmade_aligned_malloc */ EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) { - if (ptr) { + if (ptr != nullptr) { uint8_t offset = static_cast<uint8_t>(*(static_cast<uint8_t*>(ptr) - 1)); void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset); @@ -224,9 +224,11 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void* ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED - if (ptr) check_that_malloc_is_allowed(); - EIGEN_USING_STD(free) - free(ptr); + if (ptr != nullptr) { + check_that_malloc_is_allowed(); + EIGEN_USING_STD(free) + free(ptr); + } #else handmade_aligned_free(ptr); @@ -294,9 +296,11 @@ template <> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void* ptr) { - if (ptr) check_that_malloc_is_allowed(); - EIGEN_USING_STD(free) - free(ptr); + if (ptr != nullptr) { + check_that_malloc_is_allowed(); + EIGEN_USING_STD(free) + free(ptr); + } } template <bool Align>
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 126b442..1ec8fb8 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -275,7 +275,7 @@ template <typename MatrixType> typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::computeShift(Index iu, Index iter) { using std::abs; - if (iter == 10 || iter == 20) { + if ((iter == 10 || iter == 20) && iu > 1) { // exceptional shift, taken from http://www.netlib.org/eispack/comqr.f return abs(numext::real(m_matT.coeff(iu, iu - 1))) + abs(numext::real(m_matT.coeff(iu - 1, iu - 2))); }
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 1eb07f7..3132794 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -142,8 +142,8 @@ cholmod_sparse A; A = viewAsCholmod(mat); m_rows = matrix.rows(); - Index col = matrix.cols(); - m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); + m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, internal::convert_index<StorageIndex>(matrix.cols()), &A, + &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) { m_info = NumericalIssue; @@ -196,7 +196,7 @@ const MatrixType matrixR() const { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); if (!m_isRUpToDate) { - m_R = viewAsEigen<Scalar, ColMajor, typename MatrixType::StorageIndex>(*m_cR); + m_R = viewAsEigen<Scalar, StorageIndex>(*m_cR); m_isRUpToDate = true; } return m_R;
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 123c89c..8f8a696 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -71,8 +71,13 @@ void resize(Index size, double reserveSizeFactor = 0) { if (m_allocatedSize < size) { + // Avoid underflow on the std::min<Index> call by choosing the smaller index type. + using SmallerIndexType = + typename std::conditional<static_cast<size_t>((std::numeric_limits<Index>::max)()) < + static_cast<size_t>((std::numeric_limits<StorageIndex>::max)()), + Index, StorageIndex>::type; Index realloc_size = - (std::min<Index>)(NumTraits<StorageIndex>::highest(), size + Index(reserveSizeFactor * double(size))); + (std::min<Index>)(NumTraits<SmallerIndexType>::highest(), size + Index(reserveSizeFactor * double(size))); if (realloc_size < size) internal::throw_std_bad_alloc(); reallocate(realloc_size); }
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index aee3d94..29be01a 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h
@@ -99,13 +99,34 @@ * \code * VectorXd x(n), b(n); * SparseMatrix<double> A; - * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver; - * // fill A and b; - * // Compute the ordering permutation vector from the structural pattern of A + * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver; + * // Fill A and b. + * // Compute the ordering permutation vector from the structural pattern of A. * solver.analyzePattern(A); - * // Compute the numerical factorization + * // Compute the numerical factorization. * solver.factorize(A); - * //Use the factors to solve the linear system + * // Use the factors to solve the linear system. + * x = solver.solve(b); + * \endcode + * + * We can directly call compute() instead of analyzePattern() and factorize() + * \code + * VectorXd x(n), b(n); + * SparseMatrix<double> A; + * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver; + * // Fill A and b. + * solver.compute(A); + * // Use the factors to solve the linear system. + * x = solver.solve(b); + * \endcode + * + * Or give the matrix to the constructor SparseLU(const MatrixType& matrix) + * \code + * VectorXd x(n), b(n); + * SparseMatrix<double> A; + * // Fill A and b. + * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver(A); + * // Use the factors to solve the linear system. * x = solver.solve(b); * \endcode * @@ -150,10 +171,18 @@ enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; public: + /** \brief Basic constructor of the solver. + * + * Construct a SparseLU. As no matrix is given as argument, compute() should be called afterward with a matrix. + */ SparseLU() : m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) { initperfvalues(); } + /** \brief Constructor of the solver already based on a specific matrix. + * + * Construct a SparseLU. compute() is already called with the given matrix. 
+ */ explicit SparseLU(const MatrixType& matrix) : m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) { initperfvalues(); @@ -168,9 +197,15 @@ void factorize(const MatrixType& matrix); void simplicialfactorize(const MatrixType& matrix); - /** + /** \brief Analyze and factorize the matrix so the solver is ready to solve. + * * Compute the symbolic and numeric factorization of the input sparse matrix. - * The input matrix should be in column-major storage. + * The input matrix should be in column-major storage, otherwise analyzePattern() + * will do a heavy copy. + * + * Call analyzePattern() followed by factorize() + * + * \sa analyzePattern(), factorize() */ void compute(const MatrixType& matrix) { // Analyze @@ -179,7 +214,9 @@ factorize(matrix); } - /** \returns an expression of the transposed of the factored matrix. + /** \brief Return a solver for the transposed matrix. + * + * \returns an expression of the transposed of the factored matrix. * * A typical usage is to solve for the transposed problem A^T x = b: * \code @@ -196,7 +233,9 @@ return transposeView; } - /** \returns an expression of the adjoint of the factored matrix + /** \brief Return a solver for the adjoint matrix. + * + * \returns an expression of the adjoint of the factored matrix * * A typical usage is to solve for the adjoint problem A' x = b: * \code @@ -215,19 +254,28 @@ return adjointView; } + /** \brief Give the number of rows. + */ inline Index rows() const { return m_mat.rows(); } + /** \brief Give the number of columns.
+ */ inline Index cols() const { return m_mat.cols(); } - /** Indicate that the pattern of the input matrix is symmetric */ + /** \brief Let you set that the pattern of the input matrix is symmetric + */ void isSymmetric(bool sym) { m_symmetricmode = sym; } - /** \returns an expression of the matrix L, internally stored as supernodes + /** \brief Give the matrixL + * + * \returns an expression of the matrix L, internally stored as supernodes * The only operation available with this expression is the triangular solve * \code * y = b; matrixL().solveInPlace(y); * \endcode */ SparseLUMatrixLReturnType<SCMatrix> matrixL() const { return SparseLUMatrixLReturnType<SCMatrix>(m_Lstore); } - /** \returns an expression of the matrix U, + /** \brief Give the MatrixU + * + * \returns an expression of the matrix U, * The only operation available with this expression is the triangular solve * \code * y = b; matrixU().solveInPlace(y); @@ -237,12 +285,14 @@ return SparseLUMatrixUReturnType<SCMatrix, Map<SparseMatrix<Scalar, ColMajor, StorageIndex>>>(m_Lstore, m_Ustore); } - /** + /** \brief Give the row matrix permutation. + * * \returns a reference to the row matrix permutation \f$ P_r \f$ such that \f$P_r A P_c^T = L U\f$ * \sa colsPermutation() */ inline const PermutationType& rowsPermutation() const { return m_perm_r; } - /** + /** \brief Give the column matrix permutation. + * * \returns a reference to the column matrix permutation\f$ P_c^T \f$ such that \f$P_r A P_c^T = L U\f$ * \sa rowsPermutation() */ @@ -251,7 +301,9 @@ void setPivotThreshold(const RealScalar& thresh) { m_diagpivotthresh = thresh; } #ifdef EIGEN_PARSED_BY_DOXYGEN - /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A. + /** \brief Solve a system \f$ A X = B \f$ + * + * \returns the solution X of \f$ A X = B \f$ using the current decomposition of A. * * \warning the destination matrix X in X = this->solve(B) must be colmun-major. 
* @@ -267,14 +319,17 @@ * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * - * \sa iparm() + * You can get a readable error message with lastErrorMessage(). + * + * \sa lastErrorMessage() */ ComputationInfo info() const { eigen_assert(m_isInitialized && "Decomposition is not initialized."); return m_info; } - /** + /** \brief Give a human readable error + * * \returns A string describing the type of error */ std::string lastErrorMessage() const { return m_lastError; } @@ -302,7 +357,8 @@ return true; } - /** + /** \brief Give the absolute value of the determinant. + * * \returns the absolute value of the determinant of the matrix of which * *this is the QR decomposition. * @@ -330,7 +386,9 @@ return det; } - /** \returns the natural log of the absolute value of the determinant of the matrix + /** \brief Give the natural log of the absolute determinant. + * + * \returns the natural log of the absolute value of the determinant of the matrix * of which **this is the QR decomposition * * \note This method is useful to work around the risk of overflow/underflow that's @@ -356,7 +414,9 @@ return det; } - /** \returns A number representing the sign of the determinant + /** \brief Give the sign of the determinant. + * + * \returns A number representing the sign of the determinant * * \sa absDeterminant(), logAbsDeterminant() */ @@ -380,7 +440,9 @@ return det * m_detPermR * m_detPermC; } - /** \returns The determinant of the matrix. + /** \brief Give the determinant. + * + * \returns The determinant of the matrix. * * \sa absDeterminant(), logAbsDeterminant() */ @@ -401,7 +463,11 @@ return (m_detPermR * m_detPermC) > 0 ? det : -det; } + /** \brief Give the number of non zero in matrix L. + */ Index nnzL() const { return m_nnzL; } + /** \brief Give the number of non zero in matrix U. 
+ */ Index nnzU() const { return m_nnzU; } protected: @@ -442,7 +508,8 @@ }; // End class SparseLU // Functions needed by the anaysis phase -/** +/** \brief Compute the column permutation. + * * Compute the column permutation to minimize the fill-in * * - Apply this permutation to the input matrix - @@ -451,6 +518,11 @@ * * - Postorder the elimination tree and the column permutation * + * It is possible to call compute() instead of analyzePattern() + factorize(). + * + * If the matrix is row-major this function will do a heavy copy. + * + * \sa factorize(), compute() */ template <typename MatrixType, typename OrderingType> void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) { @@ -516,23 +588,24 @@ // Functions needed by the numerical factorization phase -/** +/** \brief Factorize the matrix to get the solver ready. + * * - Numerical factorization * - Interleaved with the symbolic factorization - * On exit, info is * - * = 0: successful factorization + * To check whether this function failed, call info(); a more detailed description of + * the error is available from lastErrorMessage(). * - * > 0: if info = i, and i is + * In the past (before 2012 (git history is not older)), this function was returning an integer. + * The returned value was 0 on successful factorization. + * > 0 if info = i, and i is <= A->ncol: U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, + * and division by zero will occur if it is used to solve a system of equations. + * > A->ncol: number of bytes allocated when memory allocation failure occurred, plus A->ncol. + * If lwork = -1, it is the estimated amount of space needed, plus A->ncol. * - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. + * It seems that A was the name of the matrix in the past. * - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol.
If lwork = -1, it is - * the estimated amount of space needed, plus A->ncol. + * \sa analyzePattern(), compute(), SparseLU(), info(), lastErrorMessage() */ template <typename MatrixType, typename OrderingType> void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix) { @@ -572,6 +645,8 @@ Index maxpanel = m_perfv.panel_size * m; // Allocate working storage common to the factor routines Index lwork = 0; + // Return the size of actually allocated memory when allocation failed, + // and 0 on success. Index info = Base::memInit(m, n, nnz, lwork, m_perfv.fillfactor, m_perfv.panel_size, m_glu); if (info) { m_lastError = "UNABLE TO ALLOCATE WORKING MEMORY\n\n"; @@ -656,6 +731,7 @@ // Depth-first-search for the current column VectorBlock<IndexVector> panel_lsubk(panel_lsub, k, m); VectorBlock<IndexVector> repfnz_k(repfnz, k, m); + // Return 0 on success and > 0 number of bytes allocated when run out of space. info = Base::column_dfs(m, jj, m_perm_r.indices(), m_perfv.maxsuper, nseg, panel_lsubk, segrep, repfnz_k, xprune, marker, parent, xplore, m_glu); if (info) { @@ -667,6 +743,7 @@ // Numeric updates to this column VectorBlock<ScalarVector> dense_k(dense, k, m); VectorBlock<IndexVector> segrep_k(segrep, nseg1, m - nseg1); + // Return 0 on success and > 0 number of bytes allocated when run out of space. info = Base::column_bmod(jj, (nseg - nseg1), dense_k, tempv, segrep_k, repfnz_k, jcol, m_glu); if (info) { m_lastError = "UNABLE TO EXPAND MEMORY IN COLUMN_BMOD() "; @@ -676,6 +753,7 @@ } // Copy the U-segments to ucol(*) + // Return 0 on success and > 0 number of bytes allocated when run out of space. info = Base::copy_to_ucol(jj, nseg, segrep, repfnz_k, m_perm_r.indices(), dense_k, m_glu); if (info) { m_lastError = "UNABLE TO EXPAND MEMORY IN COPY_TO_UCOL() "; @@ -685,6 +763,7 @@ } // Form the L-segment + // Return 0 on success, i > 0 if U(i, i) is exactly zero.
info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu); if (info) { m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR";
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 2022cf0..2f6f89e 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake
@@ -30,7 +30,7 @@ hip_reset_flags() hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14) target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP) - set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx1030) + set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis index 22cf346..da89857 100644 --- a/debug/msvc/eigen.natvis +++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@ -<?xml version="1.0" encoding="utf-8"?> - -<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010"> - - <!-- Fixed x Fixed Matrix --> - <Type Name="Eigen::Matrix<*,*,*,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/> - <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString> - <Expand> - <ArrayItems Condition="Flags%2"> <!-- row major layout --> - <Rank>2</Rank> - <Size>$i==0 ? $T2 : $T3</Size> - <ValuePointer>m_storage.m_data.array</ValuePointer> - </ArrayItems> - <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> - <Direction>Backward</Direction> - <Rank>2</Rank> - <Size>$i==0 ? $T2 : $T3</Size> - <ValuePointer>m_storage.m_data.array</ValuePointer> - </ArrayItems> - </Expand> - </Type> - - <!-- 2 x 2 Matrix --> - <Type Name="Eigen::Matrix<*,2,2,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/> - <DisplayString>[2, 2] (fixed matrix)</DisplayString> - <Expand> - <Synthetic Name="[row 0]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 0]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 1]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 1]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString> - </Synthetic> - </Expand> - </Type> - - <!-- 3 x 3 Matrix --> - <Type Name="Eigen::Matrix<*,3,3,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/> - <DisplayString>[3, 3] (fixed matrix)</DisplayString> - <Expand> - <Synthetic Name="[row 0]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString> - </Synthetic> - 
<Synthetic Name="[row 0]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 1]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 1]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 2]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 2]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString> - </Synthetic> - </Expand> - </Type> - - <!-- 4 x 4 Matrix --> - <Type Name="Eigen::Matrix<*,4,4,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/> - <DisplayString>[4, 4] (fixed matrix)</DisplayString> - <Expand> - <Synthetic Name="[row 0]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 0]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 1]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 1]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString> - 
</Synthetic> - <Synthetic Name="[row 2]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 2]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 3]" Condition="Flags%2"> - <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString> - </Synthetic> - <Synthetic Name="[row 3]" Condition="!(Flags%2)"> - <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString> - </Synthetic> - </Expand> - </Type> - - <!-- Dynamic x Dynamic Matrix --> - <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/> - <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> - <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString> - <Expand> - <ArrayItems Condition="Flags%2"> <!-- row major layout --> - <Rank>2</Rank> - <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> - <Direction>Backward</Direction> - <Rank>2</Rank> - <Size>$i==0 ? 
m_storage.m_rows : m_storage.m_cols</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - </Expand> - </Type> - - <!-- Fixed x Dynamic Matrix --> - <Type Name="Eigen::Matrix<*,*,-1,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/> - <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> - <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString> - <Expand> - <ArrayItems Condition="Flags%2"> <!-- row major layout --> - <Rank>2</Rank> - <Size>$i==0 ? $T2 : m_storage.m_cols</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> - <Direction>Backward</Direction> - <Rank>2</Rank> - <Size>$i==0 ? $T2 : m_storage.m_cols</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - </Expand> - </Type> - - <!-- Dynamic x Fixed Matrix --> - <Type Name="Eigen::Matrix<*,-1,*,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/> - <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> - <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString> - <Expand> - <ArrayItems Condition="Flags%2"> <!-- row major layout --> - <Rank>2</Rank> - <Size>$i==0 ? m_storage.m_rows : $T2</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> - <Direction>Backward</Direction> - <Rank>2</Rank> - <Size>$i==0 ? 
m_storage.m_rows : $T2</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - </Expand> - </Type> - - <!-- Dynamic Column Vector --> - <Type Name="Eigen::Matrix<*,1,-1,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/> - <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> - <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString> - <Expand> - <Item Name="[size]">m_storage.m_cols</Item> - <ArrayItems> - <Size>m_storage.m_cols</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - </Expand> - </Type> - - <!-- Dynamic Row Vector --> - <Type Name="Eigen::Matrix<*,-1,1,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/> - <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> - <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString> - <Expand> - <Item Name="[size]">m_storage.m_rows</Item> - <ArrayItems> - <Size>m_storage.m_rows</Size> - <ValuePointer>m_storage.m_data</ValuePointer> - </ArrayItems> - </Expand> - </Type> - - <!-- Fixed Vector --> - <Type Name="Eigen::Matrix<*,1,1,*,*,*>"> - <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/> - <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString> - <Expand> - <Item Name="[x]">m_storage.m_data.array[0]</Item> - </Expand> - </Type> - - <Type Name="Eigen::Matrix<*,2,1,*,*,*>"> - <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/> - <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/> - <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/> - <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString> - <Expand> - <Item Name="[x]">m_storage.m_data.array[0]</Item> - <Item Name="[y]">m_storage.m_data.array[1]</Item> - </Expand> - </Type> - - <Type Name="Eigen::Matrix<*,3,1,*,*,*>"> - <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/> - <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/> - 
<AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/> - <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString> - <Expand> - <Item Name="[x]">m_storage.m_data.array[0]</Item> - <Item Name="[y]">m_storage.m_data.array[1]</Item> - <Item Name="[z]">m_storage.m_data.array[2]</Item> - </Expand> - </Type> - - <Type Name="Eigen::Matrix<*,4,1,*,*,*>"> - <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/> - <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/> - <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/> - <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString> - <Expand> - <Item Name="[x]">m_storage.m_data.array[0]</Item> - <Item Name="[y]">m_storage.m_data.array[1]</Item> - <Item Name="[z]">m_storage.m_data.array[2]</Item> - <Item Name="[w]">m_storage.m_data.array[3]</Item> - </Expand> - </Type> - -</AutoVisualizer> +<?xml version="1.0" encoding="utf-8"?> + +<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010"> + + <!-- Fixed x Fixed Matrix --> + <Type Name="Eigen::Matrix<*,*,*,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/> + <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString> + <Expand> + <ArrayItems Condition="Flags%2"> <!-- row major layout --> + <Rank>2</Rank> + <Size>$i==0 ? $T2 : $T3</Size> + <ValuePointer>m_storage.m_data.array</ValuePointer> + </ArrayItems> + <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> + <Direction>Backward</Direction> + <Rank>2</Rank> + <Size>$i==0 ? 
$T2 : $T3</Size> + <ValuePointer>m_storage.m_data.array</ValuePointer> + </ArrayItems> + </Expand> + </Type> + + <!-- 2 x 2 Matrix --> + <Type Name="Eigen::Matrix<*,2,2,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/> + <DisplayString>[2, 2] (fixed matrix)</DisplayString> + <Expand> + <Synthetic Name="[row 0]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 0]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 1]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 1]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString> + </Synthetic> + </Expand> + </Type> + + <!-- 3 x 3 Matrix --> + <Type Name="Eigen::Matrix<*,3,3,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/> + <DisplayString>[3, 3] (fixed matrix)</DisplayString> + <Expand> + <Synthetic Name="[row 0]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 0]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 1]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 1]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 2]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[6]}, 
{m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 2]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString> + </Synthetic> + </Expand> + </Type> + + <!-- 4 x 4 Matrix --> + <Type Name="Eigen::Matrix<*,4,4,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/> + <DisplayString>[4, 4] (fixed matrix)</DisplayString> + <Expand> + <Synthetic Name="[row 0]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 0]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 1]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 1]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 2]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 2]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString> + </Synthetic> + <Synthetic Name="[row 3]" Condition="Flags%2"> + <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString> + 
</Synthetic> + <Synthetic Name="[row 3]" Condition="!(Flags%2)"> + <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString> + </Synthetic> + </Expand> + </Type> + + <!-- Dynamic x Dynamic Matrix --> + <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/> + <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> + <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString> + <Expand> + <ArrayItems Condition="Flags%2"> <!-- row major layout --> + <Rank>2</Rank> + <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> + <Direction>Backward</Direction> + <Rank>2</Rank> + <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + </Expand> + </Type> + + <!-- Fixed x Dynamic Matrix --> + <Type Name="Eigen::Matrix<*,*,-1,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/> + <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> + <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString> + <Expand> + <ArrayItems Condition="Flags%2"> <!-- row major layout --> + <Rank>2</Rank> + <Size>$i==0 ? $T2 : m_storage.m_cols</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> + <Direction>Backward</Direction> + <Rank>2</Rank> + <Size>$i==0 ? 
$T2 : m_storage.m_cols</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + </Expand> + </Type> + + <!-- Dynamic x Fixed Matrix --> + <Type Name="Eigen::Matrix<*,-1,*,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/> + <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> + <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString> + <Expand> + <ArrayItems Condition="Flags%2"> <!-- row major layout --> + <Rank>2</Rank> + <Size>$i==0 ? m_storage.m_rows : $T2</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + <ArrayItems Condition="!(Flags%2)"> <!-- column major layout --> + <Direction>Backward</Direction> + <Rank>2</Rank> + <Size>$i==0 ? m_storage.m_rows : $T2</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + </Expand> + </Type> + + <!-- Dynamic Column Vector --> + <Type Name="Eigen::Matrix<*,1,-1,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/> + <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> + <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString> + <Expand> + <Item Name="[size]">m_storage.m_cols</Item> + <ArrayItems> + <Size>m_storage.m_cols</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + </Expand> + </Type> + + <!-- Dynamic Row Vector --> + <Type Name="Eigen::Matrix<*,-1,1,*,*,*>"> + <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/> + <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString> + <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString> + <Expand> + <Item Name="[size]">m_storage.m_rows</Item> + <ArrayItems> + <Size>m_storage.m_rows</Size> + <ValuePointer>m_storage.m_data</ValuePointer> + </ArrayItems> + </Expand> + </Type> + + <!-- Fixed Vector --> + <Type Name="Eigen::Matrix<*,1,1,*,*,*>"> + <AlternativeType 
Name="Eigen::Array<*,1,1,*,*,*>"/> + <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString> + <Expand> + <Item Name="[x]">m_storage.m_data.array[0]</Item> + </Expand> + </Type> + + <Type Name="Eigen::Matrix<*,2,1,*,*,*>"> + <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/> + <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/> + <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/> + <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString> + <Expand> + <Item Name="[x]">m_storage.m_data.array[0]</Item> + <Item Name="[y]">m_storage.m_data.array[1]</Item> + </Expand> + </Type> + + <Type Name="Eigen::Matrix<*,3,1,*,*,*>"> + <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/> + <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/> + <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/> + <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString> + <Expand> + <Item Name="[x]">m_storage.m_data.array[0]</Item> + <Item Name="[y]">m_storage.m_data.array[1]</Item> + <Item Name="[z]">m_storage.m_data.array[2]</Item> + </Expand> + </Type> + + <Type Name="Eigen::Matrix<*,4,1,*,*,*>"> + <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/> + <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/> + <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/> + <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString> + <Expand> + <Item Name="[x]">m_storage.m_data.array[0]</Item> + <Item Name="[y]">m_storage.m_data.array[1]</Item> + <Item Name="[z]">m_storage.m_data.array[2]</Item> + <Item Name="[w]">m_storage.m_data.array[3]</Item> + </Expand> + </Type> + +</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat index 273c10d..35ef580 100644 --- a/debug/msvc/eigen_autoexp_part.dat +++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@ -; *************************************************************** -; * Eigen Visualizer -; * -; * Author: Hauke Heibel <hauke.heibel@gmail.com> -; * -; * Support the enhanced debugging of the following Eigen -; * types (*: any, +:fixed dimension) : -; * -; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*> -; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*> -; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*> -; * - Eigen::Matrix<*,-1,-1,*,*,*> -; * - Eigen::Matrix<*,+,-1,*,*,*> -; * - Eigen::Matrix<*,-1,+,*,*,*> -; * - Eigen::Matrix<*,+,+,*,*,*> -; * -; * Matrices are displayed properly independently of the memory -; * alignment (RowMajor vs. ColMajor). -; * -; * This file is distributed WITHOUT ANY WARRANTY. Please ensure -; * that your original autoexp.dat file is copied to a safe -; * place before proceeding with its modification. -; *************************************************************** - -[Visualizer] - -; Fixed size 4-vectors -Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - x : ($c.m_storage.m_data.array)[0], - y : ($c.m_storage.m_data.array)[1], - z : ($c.m_storage.m_data.array)[2], - w : ($c.m_storage.m_data.array)[3] - ) - ) - - preview - ( - #( - "[", - 4, - "](", - #array(expr: $e.m_storage.m_data.array[$i], size: 4), - ")" - ) - ) -} - -; Fixed size 3-vectors -Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - x : ($c.m_storage.m_data.array)[0], - y : ($c.m_storage.m_data.array)[1], - z : ($c.m_storage.m_data.array)[2] - ) - ) - - preview - ( - #( - "[", - 3, - "](", - #array(expr: $e.m_storage.m_data.array[$i], size: 3), - ")" - ) - ) -} - -; Fixed size 2-vectors -Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - x : ($c.m_storage.m_data.array)[0], - y : ($c.m_storage.m_data.array)[1] - ) - ) - - preview - ( - #( - "[", - 2, - "](", - 
#array(expr: $e.m_storage.m_data.array[$i], size: 2), - ")" - ) - ) -} - -; Fixed size 1-vectors -Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - x : ($c.m_storage.m_data.array)[0] - ) - ) - - preview - ( - #( - "[", - 1, - "](", - #array(expr: $e.m_storage.m_data.array[$i], size: 1), - ")" - ) - ) -} - -; Dynamic matrices (ColMajor and RowMajor support) -Eigen::Matrix<*,-1,-1,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - rows: $c.m_storage.m_rows, - cols: $c.m_storage.m_cols, - ; Check for RowMajorBit - #if ($c.Flags & 0x1) ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], - size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols - ) - ) #else ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data)[$i], - size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols - ) - ) - ) - ) - - preview - ( - #( - "[", - $c.m_storage.m_rows, - ",", - $c.m_storage.m_cols, - "](", - #array( - expr : [($c.m_storage.m_data)[$i],g], - size : $c.m_storage.m_rows*$c.m_storage.m_cols - ), - ")" - ) - ) -} - -; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support) -Eigen::Matrix<*,*,-1,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - rows: $c.RowsAtCompileTime, - cols: $c.m_storage.m_cols, - ; Check for RowMajorBit - #if ($c.Flags & 0x1) ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], - size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols - ) - ) #else ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data)[$i], - size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols - ) - ) - ) - ) - - preview - ( - #( - "[", - $c.RowsAtCompileTime, - ",", - $c.m_storage.m_cols, - "](", - #array( - expr : [($c.m_storage.m_data)[$i],g], - size : 
$c.RowsAtCompileTime*$c.m_storage.m_cols - ), - ")" - ) - ) -} - -; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support) -Eigen::Matrix<*,-1,*,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - rows: $c.m_storage.m_rows, - cols: $c.ColsAtCompileTime, - ; Check for RowMajorBit - #if ($c.Flags & 0x1) ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], - size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime - ) - ) #else ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data)[$i], - size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime - ) - ) - ) - ) - - preview - ( - #( - "[", - $c.m_storage.m_rows, - ",", - $c.ColsAtCompileTime, - "](", - #array( - expr : [($c.m_storage.m_data)[$i],g], - size : $c.m_storage.m_rows*$c.ColsAtCompileTime - ), - ")" - ) - ) -} - -; Fixed size matrix (ColMajor and RowMajor support) -Eigen::Matrix<*,*,*,*,*,*>{ - children - ( - #( - [internals]: [$c,!], - rows: $c.RowsAtCompileTime, - cols: $c.ColsAtCompileTime, - ; Check for RowMajorBit - #if ($c.Flags & 0x1) ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], - size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime - ) - ) #else ( - #array( - rank: 2, - base: 0, - expr: ($c.m_storage.m_data.array)[$i], - size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime - ) - ) - ) - ) - - preview - ( - #( - "[", - $c.RowsAtCompileTime, - ",", - $c.ColsAtCompileTime, - "](", - #array( - expr : [($c.m_storage.m_data.array)[$i],g], - size : $c.RowsAtCompileTime*$c.ColsAtCompileTime - ), - ")" - ) - ) -} +; *************************************************************** +; * Eigen Visualizer +; * +; * Author: Hauke Heibel <hauke.heibel@gmail.com> +; * +; * Support the enhanced debugging of the following 
Eigen +; * types (*: any, +:fixed dimension) : +; * +; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*> +; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*> +; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*> +; * - Eigen::Matrix<*,-1,-1,*,*,*> +; * - Eigen::Matrix<*,+,-1,*,*,*> +; * - Eigen::Matrix<*,-1,+,*,*,*> +; * - Eigen::Matrix<*,+,+,*,*,*> +; * +; * Matrices are displayed properly independently of the memory +; * alignment (RowMajor vs. ColMajor). +; * +; * This file is distributed WITHOUT ANY WARRANTY. Please ensure +; * that your original autoexp.dat file is copied to a safe +; * place before proceeding with its modification. +; *************************************************************** + +[Visualizer] + +; Fixed size 4-vectors +Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + x : ($c.m_storage.m_data.array)[0], + y : ($c.m_storage.m_data.array)[1], + z : ($c.m_storage.m_data.array)[2], + w : ($c.m_storage.m_data.array)[3] + ) + ) + + preview + ( + #( + "[", + 4, + "](", + #array(expr: $e.m_storage.m_data.array[$i], size: 4), + ")" + ) + ) +} + +; Fixed size 3-vectors +Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + x : ($c.m_storage.m_data.array)[0], + y : ($c.m_storage.m_data.array)[1], + z : ($c.m_storage.m_data.array)[2] + ) + ) + + preview + ( + #( + "[", + 3, + "](", + #array(expr: $e.m_storage.m_data.array[$i], size: 3), + ")" + ) + ) +} + +; Fixed size 2-vectors +Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + x : ($c.m_storage.m_data.array)[0], + y : ($c.m_storage.m_data.array)[1] + ) + ) + + preview + ( + #( + "[", + 2, + "](", + #array(expr: $e.m_storage.m_data.array[$i], size: 2), + ")" + ) + ) +} + +; Fixed size 1-vectors +Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + x : 
($c.m_storage.m_data.array)[0] + ) + ) + + preview + ( + #( + "[", + 1, + "](", + #array(expr: $e.m_storage.m_data.array[$i], size: 1), + ")" + ) + ) +} + +; Dynamic matrices (ColMajor and RowMajor support) +Eigen::Matrix<*,-1,-1,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + rows: $c.m_storage.m_rows, + cols: $c.m_storage.m_cols, + ; Check for RowMajorBit + #if ($c.Flags & 0x1) ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], + size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols + ) + ) #else ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data)[$i], + size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols + ) + ) + ) + ) + + preview + ( + #( + "[", + $c.m_storage.m_rows, + ",", + $c.m_storage.m_cols, + "](", + #array( + expr : [($c.m_storage.m_data)[$i],g], + size : $c.m_storage.m_rows*$c.m_storage.m_cols + ), + ")" + ) + ) +} + +; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support) +Eigen::Matrix<*,*,-1,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + rows: $c.RowsAtCompileTime, + cols: $c.m_storage.m_cols, + ; Check for RowMajorBit + #if ($c.Flags & 0x1) ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], + size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols + ) + ) #else ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data)[$i], + size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols + ) + ) + ) + ) + + preview + ( + #( + "[", + $c.RowsAtCompileTime, + ",", + $c.m_storage.m_cols, + "](", + #array( + expr : [($c.m_storage.m_data)[$i],g], + size : $c.RowsAtCompileTime*$c.m_storage.m_cols + ), + ")" + ) + ) +} + +; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support) +Eigen::Matrix<*,-1,*,*,*,*>{ + children + ( + #( + [internals]: 
[$c,!], + rows: $c.m_storage.m_rows, + cols: $c.ColsAtCompileTime, + ; Check for RowMajorBit + #if ($c.Flags & 0x1) ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], + size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime + ) + ) #else ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data)[$i], + size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime + ) + ) + ) + ) + + preview + ( + #( + "[", + $c.m_storage.m_rows, + ",", + $c.ColsAtCompileTime, + "](", + #array( + expr : [($c.m_storage.m_data)[$i],g], + size : $c.m_storage.m_rows*$c.ColsAtCompileTime + ), + ")" + ) + ) +} + +; Fixed size matrix (ColMajor and RowMajor support) +Eigen::Matrix<*,*,*,*,*,*>{ + children + ( + #( + [internals]: [$c,!], + rows: $c.RowsAtCompileTime, + cols: $c.ColsAtCompileTime, + ; Check for RowMajorBit + #if ($c.Flags & 0x1) ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], + size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime + ) + ) #else ( + #array( + rank: 2, + base: 0, + expr: ($c.m_storage.m_data.array)[$i], + size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime + ) + ) + ) + ) + + preview + ( + #( + "[", + $c.RowsAtCompileTime, + ",", + $c.ColsAtCompileTime, + "](", + #array( + expr : [($c.m_storage.m_data.array)[$i],g], + size : $c.RowsAtCompileTime*$c.ColsAtCompileTime + ), + ")" + ) + ) +}
diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index 645a7cd..7f89554 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox
@@ -72,12 +72,12 @@ </tr> <tr> <td>%Block starting at \c i,j having \c m rows, and \c n columns</td> - <td>\code A(seqN(i,m), seqN(i,n)) \endcode</td> + <td>\code A(seqN(i,m), seqN(j,n)) \endcode</td> <td>\code A.block(i,j,m,n) \endcode</td> </tr> <tr> <td>%Block starting at \c i0,j0 and ending at \c i1,j1</td> - <td>\code A(seq(i0,i1), seq(j0,j1) \endcode</td> + <td>\code A(seq(i0,i1), seq(j0,j1)) \endcode</td> <td>\code A.block(i0,j0,i1-i0+1,j1-j0+1) \endcode</td> </tr> <tr> @@ -97,7 +97,7 @@ </tr> <tr> <td>The middle row</td> - <td>\code A(last/2,all) \endcode</td> + <td>\code A(last/2, all) \endcode</td> <td>\code A.row((A.rows()-1)/2) \endcode</td> </tr> <tr>
diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js index afb97ed..f36b332 100644 --- a/doc/eigen_navtree_hacks.js +++ b/doc/eigen_navtree_hacks.js
@@ -62,23 +62,161 @@ } } -// Overloaded to adjust the size of the navtree wrt the toc -function resizeHeight() -{ - var header = $("#top"); - var sidenav = $("#side-nav"); - var content = $("#doc-content"); - var navtree = $("#nav-tree"); - var footer = $("#nav-path"); - var toc = $("#nav-toc"); +/* + @licstart The following is the entire license notice for the JavaScript code in this file. - var headerHeight = header.outerHeight(); - var footerHeight = footer.outerHeight(); - var tocHeight = toc.height(); - var windowHeight = $(window).height() - headerHeight - footerHeight; - content.css({height:windowHeight + "px"}); - navtree.css({height:(windowHeight-tocHeight) + "px"}); - sidenav.css({height:windowHeight + "px"}); + The MIT License (MIT) + + Copyright (C) 1997-2020 by Dimitri van Heesch + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software + and associated documentation files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, publish, distribute, + sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or + substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + @licend The above is the entire license notice for the JavaScript code in this file + */ +// We need to override entire resizable just so we can change the height to account for the TOC. +function initResizable() +{ + var cookie_namespace = 'doxygen'; + var sidenav,navtree,content,header,collapsed,collapsedWidth=0,barWidth=6,desktop_vp=768,titleHeight; + + function readCookie(cookie) + { + var myCookie = cookie_namespace+"_"+cookie+"="; + if (document.cookie) { + var index = document.cookie.indexOf(myCookie); + if (index != -1) { + var valStart = index + myCookie.length; + var valEnd = document.cookie.indexOf(";", valStart); + if (valEnd == -1) { + valEnd = document.cookie.length; + } + var val = document.cookie.substring(valStart, valEnd); + return val; + } + } + return 0; + } + + function writeCookie(cookie, val, expiration) + { + if (val==undefined) return; + if (expiration == null) { + var date = new Date(); + date.setTime(date.getTime()+(10*365*24*60*60*1000)); // default expiration is ten years + expiration = date.toGMTString(); + } + document.cookie = cookie_namespace + "_" + cookie + "=" + val + "; expires=" + expiration+"; path=/"; + } + + function resizeWidth() + { + var windowWidth = $(window).width() + "px"; + var sidenavWidth = $(sidenav).outerWidth(); + content.css({marginLeft:parseInt(sidenavWidth)+"px"}); + writeCookie('width',sidenavWidth-barWidth, null); + } + + function restoreWidth(navWidth) + { + var windowWidth = $(window).width() + "px"; + content.css({marginLeft:parseInt(navWidth)+barWidth+"px"}); + sidenav.css({width:navWidth + "px"}); + } + + function resizeHeight() + { + var headerHeight = header.outerHeight(); + var footerHeight = footer.outerHeight(); + var windowHeight = $(window).height() - headerHeight - footerHeight; + //========================================================================== + // MODIFICATION: + // This small section is the only portion modified within initResizable(). 
+ // The rest is copy-pasted from the doxygen-generated resize.js. + // + // Adjust nav height to make room for TOC. + var toc = $("#nav-toc"); + var tocHeight = toc.height(); + var navHeight = windowHeight; + // tocHeight is not always defined (e.g. if empty) + if (tocHeight) { + navHeight = windowHeight - tocHeight; + } + //========================================================================== + + content.css({height:windowHeight + "px"}); + navtree.css({height:navHeight + "px"}); + sidenav.css({height:windowHeight + "px"}); + + var width=$(window).width(); + if (width!=collapsedWidth) { + if (width<desktop_vp && collapsedWidth>=desktop_vp) { + if (!collapsed) { + collapseExpand(); + } + } else if (width>desktop_vp && collapsedWidth<desktop_vp) { + if (collapsed) { + collapseExpand(); + } + } + collapsedWidth=width; + } + if (location.hash.slice(1)) { + (document.getElementById(location.hash.slice(1))||document.body).scrollIntoView(); + } + } + + function collapseExpand() + { + if (sidenav.width()>0) { + restoreWidth(0); + collapsed=true; + } + else { + var width = readCookie('width'); + if (width>200 && width<$(window).width()) { restoreWidth(width); } else { restoreWidth(200); } + collapsed=false; + } + } + header = $("#top"); + sidenav = $("#side-nav"); + content = $("#doc-content"); + navtree = $("#nav-tree"); + footer = $("#nav-path"); + + $(".side-nav-resizable").resizable({resize: function(e, ui) { resizeWidth(); } }); + $(sidenav).resizable({ minWidth: 0 }); + $(window).resize(function() { resizeHeight(); }); + var device = navigator.userAgent.toLowerCase(); + var touch_device = device.match(/(iphone|ipod|ipad|android)/); + if (touch_device) { /* wider split bar for touch only devices */ + $(sidenav).css({ paddingRight:'20px' }); + $('.ui-resizable-e').css({ width:'20px' }); + $('#nav-sync').css({ right:'34px' }); + barWidth=20; + } + var width = readCookie('width'); + if (width) { restoreWidth(width); } else { resizeWidth(); } + resizeHeight(); + var 
url = location.href; + var i=url.indexOf("#"); + if (i>=0) window.location.hash=url.substr(i); + var _preventDefault = function(evt) { evt.preventDefault(); }; + $("#splitbar").bind("dragstart", _preventDefault).bind("selectstart", _preventDefault); + $(".ui-resizable-handle").dblclick(collapseExpand); + $(window).on('load',resizeHeight); } // Overloaded to save the root node into global_navtree_object @@ -241,7 +379,4 @@ setTimeout(arguments.callee, 10); } })(); - - $(window).on("load", resizeHeight); }); -
diff --git a/failtest/const_qualified_block_method_retval_0.cpp b/failtest/const_qualified_block_method_retval_0.cpp index 08b5d3c..6d19bb4 100644 --- a/failtest/const_qualified_block_method_retval_0.cpp +++ b/failtest/const_qualified_block_method_retval_0.cpp
@@ -8,6 +8,9 @@ using namespace Eigen; -void foo(CV_QUALIFIER Matrix3d &m) { Block<Matrix3d, 3, 3> b(m.block<3, 3>(0, 0)); } +void foo(CV_QUALIFIER Matrix3d &m) { + Block<Matrix3d, 3, 3> b(m.block<3, 3>(0, 0)); + EIGEN_UNUSED_VARIABLE(b); +} int main() {}
diff --git a/failtest/const_qualified_block_method_retval_1.cpp b/failtest/const_qualified_block_method_retval_1.cpp index 06e12c7..d58a018 100644 --- a/failtest/const_qualified_block_method_retval_1.cpp +++ b/failtest/const_qualified_block_method_retval_1.cpp
@@ -8,6 +8,9 @@ using namespace Eigen; -void foo(CV_QUALIFIER Matrix3d &m) { Block<Matrix3d> b(m.block(0, 0, 3, 3)); } +void foo(CV_QUALIFIER Matrix3d &m) { + Block<Matrix3d> b(m.block(0, 0, 3, 3)); + EIGEN_UNUSED_VARIABLE(b); +} int main() {}
diff --git a/failtest/const_qualified_diagonal_method_retval.cpp b/failtest/const_qualified_diagonal_method_retval.cpp index f3acba6..a10796a 100644 --- a/failtest/const_qualified_diagonal_method_retval.cpp +++ b/failtest/const_qualified_diagonal_method_retval.cpp
@@ -8,6 +8,9 @@ using namespace Eigen; -void foo(CV_QUALIFIER Matrix3d &m) { Diagonal<Matrix3d> b(m.diagonal()); } +void foo(CV_QUALIFIER Matrix3d &m) { + Diagonal<Matrix3d> b(m.diagonal()); + EIGEN_UNUSED_VARIABLE(b); +} int main() {}
diff --git a/failtest/const_qualified_transpose_method_retval.cpp b/failtest/const_qualified_transpose_method_retval.cpp index 394f64a..33200a9 100644 --- a/failtest/const_qualified_transpose_method_retval.cpp +++ b/failtest/const_qualified_transpose_method_retval.cpp
@@ -8,6 +8,9 @@ using namespace Eigen; -void foo(CV_QUALIFIER Matrix3d &m) { Transpose<Matrix3d> b(m.transpose()); } +void foo(CV_QUALIFIER Matrix3d &m) { + Transpose<Matrix3d> b(m.transpose()); + EIGEN_UNUSED_VARIABLE(b); +} int main() {}
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 8d6d754..1babd13 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt
@@ -22,7 +22,7 @@ include_directories(../blas) set(EigenLapack_SRCS -single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp + dsecnd_INT_CPU_TIME.cpp second_INT_CPU_TIME.cpp single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp ) if(EIGEN_Fortran_COMPILER_WORKS) @@ -38,7 +38,6 @@ dlapy2.f dlapy3.f slapy2.f slapy3.f clacgv.f zlacgv.f slamch.f dlamch.f - second_NONE.f dsecnd_NONE.f ) option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests")
diff --git a/lapack/dsecnd_INT_CPU_TIME.cpp b/lapack/dsecnd_INT_CPU_TIME.cpp new file mode 100644 index 0000000..684fa1d --- /dev/null +++ b/lapack/dsecnd_INT_CPU_TIME.cpp
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef _WIN32
+#include <cstdint>
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+#include <ctime>
+#endif
+
+extern "C" {
+double dsecnd_();
+}
+
+// Elapsed CPU time of the current process, in seconds (LAPACK DSECND entry point).
+// On Windows only the user-mode time reported by GetProcessTimes() is counted.
+// Returns 0.0 if the platform cannot report the processor time.
+double dsecnd_() {
+#ifdef _WIN32
+  // For MSVC, use `GetProcessTimes` for proper CPU time - MSVC uses
+  // a non-standard `std::clock` implementation (see
+  // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/clock?view=msvc-170).
+  // GetProcessTimes() uses 100-nanosecond time units.
+  FILETIME creation_time, exit_time, kernel_time, user_time;
+  if (!GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time)) {
+    // The FILETIME outputs are not guaranteed to be filled in on failure, so they must not be read.
+    return 0.0;
+  }
+  ULARGE_INTEGER user;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  const uint64_t time_100ns = user.QuadPart;
+  return static_cast<double>(time_100ns) / 10000000.0;
+#else
+  const std::clock_t ticks = std::clock();
+  // std::clock() returns clock_t(-1) when the processor time is unavailable.
+  if (ticks == static_cast<std::clock_t>(-1)) {
+    return 0.0;
+  }
+  return static_cast<double>(ticks) / static_cast<double>(CLOCKS_PER_SEC);
+#endif
+}
diff --git a/lapack/second_INT_CPU_TIME.cpp b/lapack/second_INT_CPU_TIME.cpp new file mode 100644 index 0000000..d6eb402 --- /dev/null +++ b/lapack/second_INT_CPU_TIME.cpp
@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef _WIN32
+#include <cstdint>
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+#include <ctime>
+#endif
+
+extern "C" {
+float second_();
+}
+
+// Elapsed CPU time of the current process, in seconds, as a single-precision
+// value (LAPACK SECOND entry point).
+// On Windows only the user-mode time reported by GetProcessTimes() is counted.
+// Returns 0.0f if the platform cannot report the processor time.
+float second_() {
+#ifdef _WIN32
+  // For MSVC, use `GetProcessTimes` for proper CPU time - MSVC uses
+  // a non-standard `std::clock` implementation (see
+  // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/clock?view=msvc-170).
+  // GetProcessTimes() uses 100-nanosecond time units.
+  FILETIME creation_time, exit_time, kernel_time, user_time;
+  if (!GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time)) {
+    // The FILETIME outputs are not guaranteed to be filled in on failure, so they must not be read.
+    return 0.0f;
+  }
+  ULARGE_INTEGER user;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  const uint64_t time_100ns = user.QuadPart;
+  // Divide in double precision and narrow once, so the tick count is not
+  // rounded to float's 24-bit significand before it is scaled to seconds.
+  return static_cast<float>(static_cast<double>(time_100ns) / 10000000.0);
+#else
+  const std::clock_t ticks = std::clock();
+  // std::clock() returns clock_t(-1) when the processor time is unavailable.
+  if (ticks == static_cast<std::clock_t>(-1)) {
+    return 0.0f;
+  }
+  // Scale in double precision and narrow once (see the Windows branch).
+  return static_cast<float>(static_cast<double>(ticks) / static_cast<double>(CLOCKS_PER_SEC));
+#endif
+}
diff --git a/scripts/relicense.py b/scripts/relicense.py deleted file mode 100644 index 8a5265f..0000000 --- a/scripts/relicense.py +++ /dev/null
@@ -1,69 +0,0 @@ -# This file is part of Eigen, a lightweight C++ template library -# for linear algebra. -# -# Copyright (C) 2012 Keir Mierle <mierle@gmail.com> -# -# This Source Code Form is subject to the terms of the Mozilla -# Public License v. 2.0. If a copy of the MPL was not distributed -# with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# Author: mierle@gmail.com (Keir Mierle) -# -# Make the long-awaited conversion to MPL. - -lgpl3_header = ''' -// Eigen is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 3 of the License, or (at your option) any later version. -// -// Alternatively, you can redistribute it and/or -// modify it under the terms of the GNU General Public License as -// published by the Free Software Foundation; either version 2 of -// the License, or (at your option) any later version. -// -// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License and a copy of the GNU General Public License along with -// Eigen. If not, see <http://www.gnu.org/licenses/>. -''' - -mpl2_header = """ -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-""" - -import os -import sys - -exclusions = set(['relicense.py']) - -def update(text): - if text.find(lgpl3_header) == -1: - return text, False - return text.replace(lgpl3_header, mpl2_header), True - -rootdir = sys.argv[1] -for root, sub_folders, files in os.walk(rootdir): - for basename in files: - if basename in exclusions: - print 'SKIPPED', filename - continue - filename = os.path.join(root, basename) - fo = file(filename) - text = fo.read() - fo.close() - - text, updated = update(text) - if updated: - fo = file(filename, "w") - fo.write(text) - fo.close() - print 'UPDATED', filename - else: - print ' ', filename
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fbbc98a..4c7c3a4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt
@@ -48,7 +48,7 @@ set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ") - + ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ") @@ -61,7 +61,7 @@ set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ") - + ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ") @@ -74,7 +74,7 @@ set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") - + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") @@ -87,7 +87,7 @@ set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") - + ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") @@ -171,6 +171,7 @@ set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official") add_custom_target(BuildOfficial) +ei_add_test(clz) ei_add_test(rand) ei_add_test(meta) ei_add_test(maxsizevector) @@ -406,7 +407,7 @@ string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") if(EIGEN_TEST_CUDA_CLANG) string(APPEND CMAKE_CXX_FLAGS " 
--cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") @@ -433,12 +434,12 @@ set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - + ei_add_test(gpu_example) ei_add_test(gpu_basic) - + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() @@ -477,7 +478,7 @@ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") else () message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") - endif() + endif() endif() endif()
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 0cfea8b..543ef2e 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp
@@ -97,10 +97,12 @@ Scalar e = static_cast<Scalar>(ref(lhs(i, j), rhs(i, j))); Scalar a = actual(i, j); #if EIGEN_ARCH_ARM - // Work around NEON flush-to-zero mode - // if ref returns denormalized value and Eigen returns 0, then skip the test - int ref_fpclass = std::fpclassify(e); - if (a == Scalar(0) && ref_fpclass == FP_SUBNORMAL) continue; + // Work around NEON flush-to-zero mode. + // If ref returns a subnormal value and Eigen returns 0, then skip the test. + if (a == Scalar(0) && (e > -(std::numeric_limits<Scalar>::min)() && e < (std::numeric_limits<Scalar>::min)()) && + (e <= -std::numeric_limits<Scalar>::denorm_min() || e >= std::numeric_limits<Scalar>::denorm_min())) { + continue; + } #endif bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
diff --git a/test/clz.cpp b/test/clz.cpp new file mode 100644 index 0000000..b56d328 --- /dev/null +++ b/test/clz.cpp
@@ -0,0 +1,86 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <climits>
+
+#include "main.h"
+
+// Reference count-leading-zeros: shifts left until the most significant bit
+// is set. Returns the bit width of T when val is zero.
+template <typename T>
+int ref_clz(T val) {
+  constexpr int kNumBits = sizeof(T) * CHAR_BIT;
+  T kMsbMask = T(1) << (kNumBits - 1);
+  int z = 0;
+  for (; z < kNumBits && ((val & kMsbMask) == 0); ++z) {
+    val <<= 1;
+  }
+  return z;
+}
+
+// Reference count-trailing-zeros: shifts right until the least significant
+// bit is set. Returns the bit width of T when val is zero.
+template <typename T>
+int ref_ctz(T val) {
+  constexpr int kNumBits = sizeof(T) * CHAR_BIT;
+  T kLsbMask = T(1);
+  int z = 0;
+  for (; z < kNumBits && ((val & kLsbMask) == 0); ++z) {
+    val >>= 1;
+  }
+  return z;
+}
+
+// Checks Eigen's clz/ctz against the reference implementations for one value.
+template <typename T>
+void verify_clz_ctz(T val) {
+  VERIFY_IS_EQUAL(Eigen::internal::clz(val), ref_clz(val));
+  VERIFY_IS_EQUAL(Eigen::internal::ctz(val), ref_ctz(val));
+}
+
+// Sweeps the value range of T (every value for types of at most two bytes,
+// a strided sample otherwise) and then checks the boundary bit patterns.
+template <typename T>
+void test_clz_ctz() {
+  T step = sizeof(T) <= 2 ? 1 : (Eigen::NumTraits<T>::highest() / (T(1) << 16));
+  T iters = Eigen::NumTraits<T>::highest() / step;
+  for (T i = 0; i < iters; ++i) {
+    verify_clz_ctz(static_cast<T>(i * step));
+  }
+  // The sweep stops below the top of the range (i * step < highest()), so
+  // check the boundary bit patterns explicitly.
+  verify_clz_ctz(T(0));
+  verify_clz_ctz(T(1));
+  verify_clz_ctz(static_cast<T>(T(1) << (sizeof(T) * CHAR_BIT - 1)));
+  verify_clz_ctz(Eigen::NumTraits<T>::highest());
+}
+
+// Compares Eigen's clz/ctz with the reference implementations on random values.
+template <typename T>
+void test_clz_ctz_random() {
+  for (int i = 0; i < 1024 * 1024; ++i) {
+    verify_clz_ctz(Eigen::internal::random<T>());
+  }
+}
+
+EIGEN_DECLARE_TEST(clz) {
+  CALL_SUBTEST_1(test_clz_ctz<uint8_t>());
+  CALL_SUBTEST_2(test_clz_ctz<uint16_t>());
+  CALL_SUBTEST_3(test_clz_ctz<uint32_t>());
+  CALL_SUBTEST_4(test_clz_ctz<uint64_t>());
+
+  for (int i = 0; i < g_repeat; i++) {
+    test_clz_ctz_random<uint32_t>();
+    test_clz_ctz_random<uint64_t>();
+  }
+}
diff --git a/test/product_threaded.cpp b/test/product_threaded.cpp index 410f767..1eb38fb 100644 --- a/test/product_threaded.cpp +++ b/test/product_threaded.cpp
@@ -13,9 +13,9 @@ void test_parallelize_gemm() { constexpr int n = 1024; constexpr int num_threads = 4; - MatrixXf a(n, n); - MatrixXf b(n, n); - MatrixXf c(n, n); + MatrixXf a = MatrixXf::Random(n, n); + MatrixXf b = MatrixXf::Random(n, n); + MatrixXf c = MatrixXf::Random(n, n); c.noalias() = a * b; ThreadPool pool(num_threads);
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index 130284a..e9ed3d5 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp
@@ -209,6 +209,11 @@ } } +void test_empty() { + Eigen::VectorXf empty(0); + VERIFY_IS_EQUAL(empty.stableNorm(), 0.0f); +} + template <typename Scalar> void test_hypot() { typedef typename NumTraits<Scalar>::Real RealScalar; @@ -235,6 +240,8 @@ } EIGEN_DECLARE_TEST(stable_norm) { + CALL_SUBTEST_1(test_empty()); + for (int i = 0; i < g_repeat; i++) { CALL_SUBTEST_3(test_hypot<double>()); CALL_SUBTEST_4(test_hypot<float>());
diff --git a/test/threads_non_blocking_thread_pool.cpp b/test/threads_non_blocking_thread_pool.cpp index 2f0cf58..e805cf2 100644 --- a/test/threads_non_blocking_thread_pool.cpp +++ b/test/threads_non_blocking_thread_pool.cpp
@@ -112,53 +112,56 @@ static void test_pool_partitions() { const int kThreads = 2; - ThreadPool tp(kThreads); - - // Assign each thread to its own partition, so that stealing other work only - // occurs globally when a thread is idle. - std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads); - for (int i = 0; i < kThreads; ++i) { - steal_partitions[i] = std::make_pair(i, i + 1); - } - tp.SetStealPartitions(steal_partitions); std::atomic<int> running(0); std::atomic<int> done(0); std::atomic<int> phase(0); - // Schedule kThreads tasks and ensure that they all are running. - for (int i = 0; i < kThreads; ++i) { - tp.Schedule([&]() { - const int thread_id = tp.CurrentThreadId(); - VERIFY_GE(thread_id, 0); - VERIFY_LE(thread_id, kThreads - 1); - ++running; - while (phase < 1) { - } - ++done; - }); + { + ThreadPool tp(kThreads); + + // Assign each thread to its own partition, so that stealing other work only + // occurs globally when a thread is idle. + std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads); + for (int i = 0; i < kThreads; ++i) { + steal_partitions[i] = std::make_pair(i, i + 1); + } + tp.SetStealPartitions(steal_partitions); + + // Schedule kThreads tasks and ensure that they all are running. + for (int i = 0; i < kThreads; ++i) { + tp.Schedule([&]() { + const int thread_id = tp.CurrentThreadId(); + VERIFY_GE(thread_id, 0); + VERIFY_LE(thread_id, kThreads - 1); + ++running; + while (phase < 1) { + } + ++done; + }); + } + while (running != kThreads) { + } + // Schedule each closure to only run on thread 'i' and verify that it does. 
+ for (int i = 0; i < kThreads; ++i) { + tp.ScheduleWithHint( + [&, i]() { + ++running; + const int thread_id = tp.CurrentThreadId(); + VERIFY_IS_EQUAL(thread_id, i); + while (phase < 2) { + } + ++done; + }, + i, i + 1); + } + running = 0; + phase = 1; + while (running != kThreads) { + } + running = 0; + phase = 2; } - while (running != kThreads) { - } - // Schedule each closure to only run on thread 'i' and verify that it does. - for (int i = 0; i < kThreads; ++i) { - tp.ScheduleWithHint( - [&, i]() { - ++running; - const int thread_id = tp.CurrentThreadId(); - VERIFY_IS_EQUAL(thread_id, i); - while (phase < 2) { - } - ++done; - }, - i, i + 1); - } - running = 0; - phase = 1; - while (running != kThreads) { - } - running = 0; - phase = 2; } EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index c4dedc1..9375398 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -934,6 +934,7 @@ template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1132,11 +1133,13 @@ template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); } template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp<DimId, Derived> chip(const Index offset) { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 8a784af..32980c7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -78,7 +78,9 @@ typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) - : m_xpr(expr), m_offset(offset), m_dim(dim) {} + : m_xpr(expr), m_offset(offset), m_dim(dim) { + eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range"); + } EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; } EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c3a7ef4..d0fbfb3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -13,6 +13,8 @@ // IWYU pragma: private #include "./InternalHeaderCheck.h" +#include <memory> + namespace Eigen { /** \class TensorForcedEval @@ -91,6 +93,26 @@ }; } // end namespace internal +template <typename Device> +class DeviceTempPointerHolder { + public: + DeviceTempPointerHolder(const Device& device, size_t size) + : device_(device), size_(size), ptr_(device.allocate_temp(size)) {} + + ~DeviceTempPointerHolder() { + device_.deallocate_temp(ptr_); + size_ = 0; + ptr_ = nullptr; + } + + void* ptr() { return ptr_; } + + private: + Device device_; + size_t size_; + void* ptr_; +}; + template <typename ArgType_, typename Device> struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> { typedef const internal::remove_all_t<ArgType_> ArgType; @@ -124,13 +146,20 @@ //===--------------------------------------------------------------------===// TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) {} + : m_impl(op.expression(), device), + m_op(op.expression()), + m_device(device), + m_buffer_holder(nullptr), + m_buffer(nullptr) {} + + ~TensorEvaluator() { cleanup(); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr()); internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer); @@ -148,7 +177,9 @@ template <typename EvalSubExprsCallback> EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { const Index numValues = 
internal::array_prod(m_impl.dimensions()); - m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr()); + typedef TensorEvalToOp<const std::remove_const_t<ArgType>> EvalTo; EvalTo evalToTmp(m_device.get(m_buffer), m_op); @@ -162,8 +193,8 @@ #endif EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate_temp(m_buffer); - m_buffer = NULL; + m_buffer_holder = nullptr; + m_buffer = nullptr; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } @@ -179,7 +210,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { - eigen_assert(m_buffer != NULL); + eigen_assert(m_buffer != nullptr); return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); } @@ -187,13 +218,14 @@ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; } private: TensorEvaluator<ArgType, Device> m_impl; const ArgType m_op; const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_buffer; + std::shared_ptr<DeviceTempPointerHolder<Device>> m_buffer_holder; + EvaluatorPointerType m_buffer; // Cached copy of the value stored in m_buffer_holder. }; } // end namespace Eigen
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h index 5e65b26..f92622d 100644 --- a/unsupported/Eigen/src/SparseExtra/MarketIO.h +++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -309,6 +309,7 @@ out << header << std::endl; out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n"; int count = 0; + EIGEN_UNUSED_VARIABLE(count); for (int j = 0; j < mat.outerSize(); ++j) for (typename SparseMatrixType::InnerIterator it(mat, j); it; ++it) { ++count;
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp index ce78892..68455b3 100644 --- a/unsupported/test/cxx11_tensor_concatenation.cpp +++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -36,6 +36,8 @@ static void test_static_dimension_failure() { Tensor<int, 2, DataLayout> left(2, 3); Tensor<int, 3, DataLayout> right(2, 3, 1); + left.setRandom(); + right.setRandom(); #ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE // Technically compatible, but we static assert that the inputs have same
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index ba553f9..228fa9e 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -24,7 +24,7 @@ // Default assignment that does no use block evaluation or vectorization. // We assume that default coefficient evaluation is well tested and correct. template <typename Dst, typename Expr> -static void DefaultAssign(Dst& dst, Expr expr) { +void DefaultAssign(Dst& dst, Expr expr) { using Assign = Eigen::TensorAssignOp<Dst, const Expr>; using Executor = Eigen::internal::TensorExecutor<const Assign, DefaultDevice, /*Vectorizable=*/false, @@ -35,7 +35,7 @@ // Assignment with specified device and tiling strategy. template <bool Vectorizable, TiledEvaluation Tiling, typename Device, typename Dst, typename Expr> -static void DeviceAssign(Device& d, Dst& dst, Expr expr) { +void DeviceAssign(Device& d, Dst& dst, Expr expr) { using Assign = Eigen::TensorAssignOp<Dst, const Expr>; using Executor = Eigen::internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; @@ -52,7 +52,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_unary_expr(Device d) { +void test_execute_unary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation @@ -77,7 +77,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_binary_expr(Device d) { +void test_execute_binary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation @@ -105,7 +105,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_broadcasting(Device d) { +void test_execute_broadcasting(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims<NumDims>(1, 10); @@ -134,92 +134,103 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int 
Layout> -static void test_execute_chipping_rvalue(Device d) { - auto dims = RandomDims<NumDims>(1, 10); - Tensor<T, NumDims, Layout, Index> src(dims); - src.setRandom(); +struct test_execute_chipping_rvalue_runner { + template <int ChipDim> + static std::enable_if_t<0 <= ChipDim, void> run_dim(Device& d, const array<Index, NumDims>& dims, + const Tensor<T, NumDims, Layout, Index>& src) { + const auto offset = internal::random<Index>(0, dims[(ChipDim)] - 1); + const auto expr = src.template chip<ChipDim>(offset); -#define TEST_CHIPPING(CHIP_DIM) \ - if (NumDims > (CHIP_DIM)) { \ - const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ - const auto expr = src.template chip<(CHIP_DIM)>(offset); \ - \ - Tensor<T, NumDims - 1, Layout, Index> golden; \ - golden = expr; \ - \ - Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \ - \ - using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \ - using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; \ - \ - Executor::run(Assign(dst, expr), d); \ - \ - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ - } \ + Tensor<T, NumDims - 1, Layout, Index> golden; + golden = expr; + + Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } + + // Recursively reduce chip dimension. 
+ run_dim<ChipDim - 1>(d, dims, src); } - TEST_CHIPPING(0) - TEST_CHIPPING(1) - TEST_CHIPPING(2) - TEST_CHIPPING(3) - TEST_CHIPPING(4) - TEST_CHIPPING(5) + template <int ChipDim> + static std::enable_if_t < + ChipDim<0, void> run_dim(Device&, const array<Index, NumDims>&, const Tensor<T, NumDims, Layout, Index>&) {} -#undef TEST_CHIPPING + static void run(Device d) { + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Layout, Index> src(dims); + src.setRandom(); + run_dim<NumDims - 1>(d, dims, src); + } +}; + +template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> +void test_execute_chipping_rvalue(Device d) { + test_execute_chipping_rvalue_runner<T, NumDims, Device, Vectorizable, Tiling, Layout>::run(d); } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_chipping_lvalue(Device d) { - auto dims = RandomDims<NumDims>(1, 10); +struct test_execute_chipping_lvalue_runner { + template <int ChipDim> + static std::enable_if_t<0 <= ChipDim> run_dim(Device& d, const array<Index, NumDims>& dims) { + /* Generate random data that we'll assign to the chipped tensor dim. */ + array<Index, NumDims - 1> src_dims; + for (int i = 0; i < NumDims - 1; ++i) { + int dim = i < (ChipDim) ? i : i + 1; + src_dims[i] = dims[dim]; + } -#define TEST_CHIPPING(CHIP_DIM) \ - if (NumDims > (CHIP_DIM)) { \ - /* Generate random data that we'll assign to the chipped tensor dim. */ \ - array<Index, NumDims - 1> src_dims; \ - for (int i = 0; i < NumDims - 1; ++i) { \ - int dim = i < (CHIP_DIM) ? 
i : i + 1; \ - src_dims[i] = dims[dim]; \ - } \ - \ - Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \ - src.setRandom(); \ - \ - const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ - \ - Tensor<T, NumDims, Layout, Index> random(dims); \ - random.setZero(); \ - \ - Tensor<T, NumDims, Layout, Index> golden(dims); \ - golden = random; \ - golden.template chip<(CHIP_DIM)>(offset) = src; \ - \ - Tensor<T, NumDims, Layout, Index> dst(dims); \ - dst = random; \ - auto expr = dst.template chip<(CHIP_DIM)>(offset); \ - \ - using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \ - using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; \ - \ - Executor::run(Assign(expr, src), d); \ - \ - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ - } \ + Tensor<T, NumDims - 1, Layout, Index> src(src_dims); + src.setRandom(); + + const auto offset = internal::random<Index>(0, dims[(ChipDim)] - 1); + + Tensor<T, NumDims, Layout, Index> random(dims); + random.setZero(); + + Tensor<T, NumDims, Layout, Index> golden(dims); + golden = random; + golden.template chip<(ChipDim)>(offset) = src; + + Tensor<T, NumDims, Layout, Index> dst(dims); + dst = random; + auto expr = dst.template chip<(ChipDim)>(offset); + + using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; + using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(expr, src), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } + + run_dim<ChipDim - 1>(d, dims); } - TEST_CHIPPING(0) - TEST_CHIPPING(1) - TEST_CHIPPING(2) - TEST_CHIPPING(3) - TEST_CHIPPING(4) - TEST_CHIPPING(5) + template <int ChipDim> + static std::enable_if_t < ChipDim<0, void> run_dim(Device&, const array<Index, NumDims>&) {} -#undef TEST_CHIPPING + static void run(Device d) { + auto dims = 
RandomDims<NumDims>(1, 10); + run_dim<NumDims - 1>(d, dims); + } +}; + +template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> +void test_execute_chipping_lvalue(Device d) { + test_execute_chipping_lvalue_runner<T, NumDims, Device, Vectorizable, Tiling, Layout>::run(d); } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_shuffle_rvalue(Device d) { +void test_execute_shuffle_rvalue(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims<NumDims>(1, 10); @@ -255,7 +266,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_shuffle_lvalue(Device d) { +void test_execute_shuffle_lvalue(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims<NumDims>(5, 10); @@ -289,7 +300,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_reshape(Device d) { +void test_execute_reshape(Device d) { static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int ReshapedDims = NumDims - 1; @@ -326,7 +337,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_slice_rvalue(Device d) { +void test_execute_slice_rvalue(Device d) { static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; @@ -362,7 +373,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_slice_lvalue(Device d) { +void test_execute_slice_lvalue(Device d) { static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; @@ -402,7 +413,7 @@ } template <typename T, int NumDims, 
typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_broadcasting_of_forced_eval(Device d) { +void test_execute_broadcasting_of_forced_eval(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims<NumDims>(1, 10); @@ -442,7 +453,7 @@ }; template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_generator_op(Device d) { +void test_execute_generator_op(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims<NumDims>(20, 30); @@ -470,7 +481,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_execute_reverse_rvalue(Device d) { +void test_execute_reverse_rvalue(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims)); @@ -502,7 +513,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_async_execute_unary_expr(Device d) { +void test_async_execute_unary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation @@ -532,7 +543,7 @@ } template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout> -static void test_async_execute_binary_expr(Device d) { +void test_async_execute_binary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index 22d6665..4bf5452 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp
@@ -7,6 +7,9 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include <cstdlib> +#include <string> + #include "sparse.h" #ifdef min @@ -19,6 +22,16 @@ #include <Eigen/SparseExtra> +// Read from an environment variable TEST_TMPDIR, if available, +// and append the provided filename. Defaults to local directory. +std::string GetTestTempFilename(const char* filename) { + const char* test_tmpdir = std::getenv("TEST_TMPDIR"); + if (test_tmpdir == nullptr) { + return std::string(filename); + } + return std::string(test_tmpdir) + std::string("/") + std::string(filename); +} + template <typename SetterType, typename DenseType, typename Scalar, int Options> bool test_random_setter(SparseMatrix<Scalar, Options>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords) { @@ -116,8 +129,9 @@ Index cols = internal::random<Index>(1, 100); SparseMatrixType m1, m2; m1 = DenseMatrix::Random(rows, cols).sparseView(); - saveMarket(m1, "sparse_extra.mtx"); - loadMarket(m2, "sparse_extra.mtx"); + std::string filename = GetTestTempFilename("sparse_extra.mtx"); + saveMarket(m1, filename); + loadMarket(m2, filename); VERIFY_IS_EQUAL(DenseMatrix(m1), DenseMatrix(m2)); } @@ -126,8 +140,9 @@ Index size = internal::random<Index>(1, 100); VectorType v1, v2; v1 = VectorType::Random(size); - saveMarketVector(v1, "vector_extra.mtx"); - loadMarketVector(v2, "vector_extra.mtx"); + std::string filename = GetTestTempFilename("vector_extra.mtx"); + saveMarketVector(v1, filename); + loadMarketVector(v2, filename); VERIFY_IS_EQUAL(v1, v2); } @@ -149,8 +164,9 @@ DenseMatrixType m1, m2; m1 = DenseMatrixType::Random(rows, cols); - saveMarketDense(m1, "dense_extra.mtx"); - loadMarketDense(m2, "dense_extra.mtx"); + std::string filename = GetTestTempFilename("dense_extra.mtx"); + saveMarketDense(m1, filename); + loadMarketDense(m2, filename); VERIFY_IS_EQUAL(m1, m2); }