Update Eigen to commit: f3912891504ad965d5fe8bd11d7346faa69b4026
CHANGELOG
=========
f39128915 - Fix bug in checking subnormals.
5a90fbcea - Update documentation of lapack second/dsecnd.
907982024 - Remove simple relicense script.
851b40afd - LAPACK CPU time functions.
a73970a86 - Fix arm32 issues.
580812201 - Formatting.
92f9544f6 - Remove explicit specialization of member function.
2692fb2b7 - Fix compile-time error caused by chip static asserts
2c6b61c00 - Add half and quarter vector support to HVX architecture
05a457534 - Chipping Asserts v2
fc92fe312 - SPQR: Fix build error, Index/StorageIndex mismatch.
eede526b7 - [Compressed Storage] Use smaller type of Index & StorageIndex for determining maximum size during resize.
772057c55 - Revert "Add asserts for .chip"
6163dbe2b - Allow specifying a temporary directory for fileio outputs.
6b6bb9d34 - Fix unused warnings in failtest.
f6e41e643 - Revert "Clean up stableNorm"
b1ae206ea - Add asserts for .chip
b0f906419 - add missing constexpr qualifier
34fd46a9b - Update CI with testing framework from eigen_ci_cross_testing.
b2814d53a - Fix stableNorm when input is zero-sized.
c29a41011 - check pointers before freeing
48e0c827d - [ROCm] MI300 related test support
538577301 - Fix TensorForcedEval in the case of the evaluator being copied.
3f3bc6d86 - Improve documentation of SparseLU
d33174d5a - Doc: Fix Basic slicing examples minor issues
19b119288 - Add factor getters to Cholmod LLT/LDLT
a1a96fafd - Clean up stableNorm
3026f1f29 - Fix various asan errors.
a2cf99ec6 - Fix GPU+clang+asan.
fee5d60b5 - Fix MSAN failures.
9697d481c - Fix up clang-format CI.
85efa8329 - Set up clang-format in CI
2c4541f73 - fix msvc clz
75e273afc - Add internal ctz/clz implementation.
PiperOrigin-RevId: 601659527
Change-Id: I52089e61fd81692bb2147f7d15ad98eb56da27c1
diff --git a/Eigen/Core b/Eigen/Core
index 39f2b3f..f9d9974 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -378,10 +378,6 @@
#include "src/Core/arch/AVX512/GemmKernel.h"
#endif
-#if defined(EIGEN_VECTORIZE_HVX)
-#include "src/Core/arch/HVX/GeneralBlockPanelKernel.h"
-#endif
-
#include "src/Core/Select.h"
#include "src/Core/VectorwiseOp.h"
#include "src/Core/PartialReduxEvaluator.h"
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index be2e737..447a393 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -149,11 +149,20 @@
/** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
* The data are not copied but shared. */
-template <typename Scalar, int Flags, typename StorageIndex>
-Map<SparseMatrix<Scalar, Flags, StorageIndex> > viewAsEigen(cholmod_sparse& cm) {
- return Map<SparseMatrix<Scalar, Flags, StorageIndex> >(cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
- static_cast<StorageIndex*>(cm.p),
- static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
+template <typename Scalar, typename StorageIndex>
+Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_sparse& cm) {
+ return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >(
+ cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol], static_cast<StorageIndex*>(cm.p),
+ static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
+}
+
+/** Returns a view of the Cholmod sparse matrix factor \a cm as an Eigen sparse matrix.
+ * The data are not copied but shared. */
+template <typename Scalar, typename StorageIndex>
+Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_factor& cm) {
+ return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >(
+ cm.n, cm.n, static_cast<StorageIndex*>(cm.p)[cm.n], static_cast<StorageIndex*>(cm.p),
+ static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
}
namespace internal {
@@ -188,6 +197,7 @@
EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_sparse*, factor_to_sparse, cholmod_factor, L)
template <typename StorageIndex_>
inline cholmod_dense* cm_solve(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
@@ -377,7 +387,7 @@
// TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
// NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's
// sparse solver)
- dest.derived() = viewAsEigen<typename DestDerived::Scalar, ColMajor, typename DestDerived::StorageIndex>(*x_cs);
+ dest.derived() = viewAsEigen<typename DestDerived::Scalar, typename DestDerived::StorageIndex>(*x_cs);
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
}
#endif // EIGEN_PARSED_BY_DOXYGEN
@@ -483,6 +493,11 @@
public:
typedef MatrixType_ MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename MatrixType::StorageIndex StorageIndex;
+ typedef TriangularView<const MatrixType, Eigen::Lower> MatrixL;
+ typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::Upper> MatrixU;
CholmodSimplicialLLT() : Base() { init(); }
@@ -493,6 +508,12 @@
~CholmodSimplicialLLT() {}
+ /** \returns an expression of the factor L */
+ inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); }
+
+ /** \returns an expression of the factor U (= L^*) */
+ inline MatrixU matrixU() const { return matrixL().adjoint(); }
+
protected:
void init() {
m_cholmod.final_asis = 0;
@@ -531,6 +552,12 @@
public:
typedef MatrixType_ MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename MatrixType::StorageIndex StorageIndex;
+ typedef Matrix<Scalar, Dynamic, 1> VectorType;
+ typedef TriangularView<const MatrixType, Eigen::UnitLower> MatrixL;
+ typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
CholmodSimplicialLDLT() : Base() { init(); }
@@ -541,6 +568,26 @@
~CholmodSimplicialLDLT() {}
+ /** \returns a vector expression of the diagonal D */
+ inline VectorType vectorD() const {
+ auto cholmodL = viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor);
+
+ VectorType D{cholmodL.rows()};
+
+ for (Index k = 0; k < cholmodL.outerSize(); ++k) {
+ typename decltype(cholmodL)::InnerIterator it{cholmodL, k};
+ D(k) = it.value();
+ }
+
+ return D;
+ }
+
+ /** \returns an expression of the factor L */
+ inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); }
+
+ /** \returns an expression of the factor U (= L^*) */
+ inline MatrixU matrixU() const { return matrixL().adjoint(); }
+
protected:
void init() {
m_cholmod.final_asis = 1;
@@ -578,6 +625,9 @@
public:
typedef MatrixType_ MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename MatrixType::StorageIndex StorageIndex;
CholmodSupernodalLLT() : Base() { init(); }
@@ -588,6 +638,19 @@
~CholmodSupernodalLLT() {}
+ /** \returns an expression of the factor L */
+ inline MatrixType matrixL() const {
+ // Convert Cholmod factor's supernodal storage format to Eigen's CSC storage format
+ cholmod_sparse* cholmodL = internal::cm_factor_to_sparse(*Base::m_cholmodFactor, m_cholmod);
+ MatrixType L = viewAsEigen<Scalar, StorageIndex>(*cholmodL);
+ internal::cm_free_sparse<StorageIndex>(cholmodL, m_cholmod);
+
+ return L;
+ }
+
+ /** \returns an expression of the factor U (= L^*) */
+ inline MatrixType matrixU() const { return matrixL().adjoint(); }
+
protected:
void init() {
m_cholmod.final_asis = 1;
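
Note on usage: the factor getters added above mirror the ones on Eigen's built-in SimplicialLLT/LDLT solvers. A minimal sketch, assuming a sparse SPD matrix is already assembled (the function and variable names below are illustrative, not part of the patch):

#include <Eigen/CholmodSupport>
#include <Eigen/SparseCore>

void inspect_factors(const Eigen::SparseMatrix<double>& A) {
  Eigen::CholmodSimplicialLDLT<Eigen::SparseMatrix<double>> ldlt(A);
  if (ldlt.info() != Eigen::Success) return;
  // New in this update: expose the factors computed by CHOLMOD.
  Eigen::SparseMatrix<double> L = ldlt.matrixL();  // unit-lower triangular factor
  Eigen::VectorXd D = ldlt.vectorD();              // diagonal of D in the L D L^* factorization
  (void)L;
  (void)D;
}

For CholmodSupernodalLLT, matrixL() first converts CHOLMOD's supernodal storage to CSC through the newly specialized cm_factor_to_sparse, so it returns a copy rather than a shared view.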
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 5936336..3a302c0 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -612,11 +612,7 @@
}
};
-#ifndef SYCL_DEVICE_ONLY
-#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func
-#else
#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& a, const Type& b) { return Func(a, b); }
-#endif
/** \internal \returns the min of \a a and \a b (coeff-wise).
If \a a or \b b is NaN, the return value is implementation defined. */
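
With the #ifndef SYCL_DEVICE_ONLY branch removed, EIGEN_BINARY_OP_NAN_PROPAGATION now wraps Func in a lambda on every build instead of letting it decay to a function pointer, which avoids overload-resolution trouble when Func names an overload set or a function template. A hypothetical stand-in (the names below are not from Eigen) showing the pattern:

#include <cassert>

// A template name cannot be passed around as a single callable without
// committing to one specialization; the wrapping lambda defers the call
// (and hence overload resolution) to the point of use.
template <typename T>
T mymin(const T& a, const T& b) { return b < a ? b : a; }

#define BINARY_OP_WRAP(Type, Func) \
  [](const Type& a, const Type& b) { return Func(a, b); }

int main() {
  auto f = BINARY_OP_WRAP(float, mymin);  // fine: no decay of the template name
  assert(f(1.0f, 2.0f) == 1.0f);
  return 0;
}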
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 95f9b97..0be29bc 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -628,6 +628,147 @@
// no value, error at compile time
};
+template <typename BitsType, typename EnableIf = void>
+struct count_bits_impl {
+ static_assert(std::is_integral<BitsType>::value && std::is_unsigned<BitsType>::value,
+ "BitsType must be an unsigned integer");
+
+ static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ int n = CHAR_BIT * sizeof(BitsType);
+ int shift = n / 2;
+ while (bits > 0 && shift > 0) {
+ BitsType y = bits >> shift;
+ if (y > 0) {
+ n -= shift;
+ bits = y;
+ }
+ shift /= 2;
+ }
+ if (shift == 0) {
+ --n;
+ }
+ return n;
+ }
+
+ static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ int n = CHAR_BIT * sizeof(BitsType);
+ int shift = n / 2;
+ while (bits > 0 && shift > 0) {
+ BitsType y = bits << shift;
+ if (y > 0) {
+ n -= shift;
+ bits = y;
+ }
+ shift /= 2;
+ }
+ if (shift == 0) {
+ --n;
+ }
+ return n;
+ }
+};
+
+// Count leading zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ return count_bits_impl<BitsType>::clz(bits);
+}
+
+// Count trailing zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ return count_bits_impl<BitsType>::ctz(bits);
+}
+
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned int)>> {
+ static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+ static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+ static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ static constexpr int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT;
+ return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset;
+ }
+
+ static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ return bits == 0 ? kNumBits : __builtin_ctz(static_cast<unsigned int>(bits));
+ }
+};
+
+template <typename BitsType>
+struct count_bits_impl<
+ BitsType, std::enable_if_t<sizeof(unsigned int) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(unsigned long)>> {
+ static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+ static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+ static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ static constexpr int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT;
+ return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset;
+ }
+
+ static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ return bits == 0 ? kNumBits : __builtin_ctzl(static_cast<unsigned long>(bits));
+ }
+};
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) &&
+ sizeof(BitsType) <= sizeof(unsigned long long)>> {
+ static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+ static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+ static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ static constexpr int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT;
+ return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset;
+ }
+
+ static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ return bits == 0 ? kNumBits : __builtin_ctzll(static_cast<unsigned long long>(bits));
+ }
+};
+
+#elif EIGEN_COMP_MSVC
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned long)>> {
+ static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+ static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+ static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ unsigned long out;
+ _BitScanReverse(&out, static_cast<unsigned long>(bits));
+ return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+ }
+
+ static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ unsigned long out;
+ _BitScanForward(&out, static_cast<unsigned long>(bits));
+ return bits == 0 ? kNumBits : static_cast<int>(out);
+ }
+};
+
+#ifdef _WIN64
+
+template <typename BitsType>
+struct count_bits_impl<
+ BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(__int64)>> {
+ static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+ static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
+ static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+ unsigned long out;
+ _BitScanReverse64(&out, static_cast<unsigned __int64>(bits));
+ return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+ }
+
+ static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+ unsigned long out;
+ _BitScanForward64(&out, static_cast<unsigned __int64>(bits));
+ return bits == 0 ? kNumBits : static_cast<int>(out);
+ }
+};
+
+#endif // _WIN64
+
+#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
template <typename Scalar>
struct random_default_impl<Scalar, false, true> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
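
The portable loop above and the compiler-builtin specializations agree on one extra guarantee worth noting: a zero input returns the full bit width, a case that is undefined for the raw __builtin_clz/ctz and _BitScan* primitives. A minimal sanity check, assuming a 32-bit unsigned int (illustrative only):

#include <Eigen/Core>
#include <cassert>

int main() {
  using Eigen::internal::clz;
  using Eigen::internal::ctz;
  assert(clz(1u) == 31 && ctz(1u) == 0);                    // lowest bit set
  assert(clz(0x80000000u) == 0 && ctz(0x80000000u) == 31);  // highest bit set
  assert(clz(0u) == 32 && ctz(0u) == 32);                   // zero: full width, well defined
  return 0;
}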
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index a8307c7..f9bf737 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -204,7 +204,9 @@
* provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
*
* See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { return m_storage.data()[index]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index index) const {
+ return m_storage.data()[index];
+ }
/** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
* provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
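
Marking coeff(Index) constexpr lets the read participate in constant evaluation whenever the enclosing object is itself usable in a constant expression. A hypothetical stand-in (not Eigen code) demonstrating the effect:

#include <cstddef>

struct TinyStorage {
  float data[4];
  constexpr const float& coeff(std::size_t i) const { return data[i]; }
};

int main() {
  constexpr TinyStorage s{{1.f, 2.f, 3.f, 4.f}};
  static_assert(s.coeff(2) == 3.f, "folded at compile time");
  return 0;
}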
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index fc11174..131e6f1 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-//
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
-#define EIGEN_PACKET_MATH_FP16_AVX512_H
-
-// IWYU pragma: private
-#include "../../InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
-
-template <>
-struct is_arithmetic<Packet8h> {
- enum { value = true };
-};
-
-template <>
-struct packet_traits<half> : default_packet_traits {
- typedef Packet32h type;
- typedef Packet16h half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
-
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
- HasSetLinear = 0,
- HasLog = 1,
- HasLog1p = 1,
- HasExp = 1,
- HasExpm1 = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- // These should be implemented in the future
- HasBessel = 0,
- HasNdtri = 0,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasTanh = EIGEN_FAST_MATH,
- HasErf = 0, // EIGEN_FAST_MATH,
- HasBlend = 0,
- HasRound = 1,
- HasFloor = 1,
- HasCeil = 1,
- HasRint = 1
- };
-};
-
-template <>
-struct unpacket_traits<Packet32h> {
- typedef Eigen::half type;
- typedef Packet16h half;
- enum {
- size = 32,
- alignment = Aligned64,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet16h> {
- typedef Eigen::half type;
- typedef Packet8h half;
- enum {
- size = 16,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet8h> {
- typedef Eigen::half type;
- typedef Packet8h half;
- enum {
- size = 8,
- alignment = Aligned16,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-// Memory functions
-
-// pset1
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
- return _mm512_set1_ph(static_cast<_Float16>(from));
-}
-
-// pset1frombits
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
- return _mm512_castsi512_ph(_mm512_set1_epi16(from));
-}
-
-// pfirst
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- return half_impl::raw_uint16_to_half(
- static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
- Eigen::half dest[32];
- _mm512_storeu_ph(dest, from);
- return dest[0];
-#endif
-}
-
-// pload
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
-}
-
-// ploadu
-
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
-}
-
-// pstore
-
-template <>
-EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
-}
-
-// pstoreu
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
-}
-
-// ploaddup
-template <>
-EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
- __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
- return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
- 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
- a);
-}
-
-// ploadquad
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
- __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
- return _mm512_permutexvar_ph(
- _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
- a);
-}
-
-// pabs
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
- return _mm512_abs_ph(a);
-}
-
-// psignbit
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
- return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
-}
-
-// pmin
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_min_ph(a, b);
-}
-
-// pmax
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_max_ph(a, b);
-}
-
-// plset
-template <>
-EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
- return _mm512_add_ph(_mm512_set1_ph(a),
- _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
- 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
- 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
-}
-
-// por
-
-template <>
-EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pxor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pand
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pandnot
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
-}
-
-// pselect
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
- __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
- return _mm512_mask_blend_ph(mask32, a, b);
-}
-
-// pcmp_eq
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_le
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt_or_nan
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
-}
-
-// padd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_add_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// psub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_sub_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pmul
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_mul_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pdiv
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_div_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pround
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
- // Work-around for default std::round rounding mode.
-
- // Mask for the sign bit
- const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
- // The largest half-precision float less than 0.5
- const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
-
- return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-// print
-
-template <>
-EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
-}
-
-// pceil
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
-}
-
-// pfloor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
-}
-
-// predux
-template <>
-EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
- return (half)_mm512_reduce_add_ph(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
- return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
- return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
-}
-
-// predux_half_dowto4
-template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
- __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
- Eigen::half data[32];
- _mm512_storeu_ph(data, a);
-
- __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
- __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
-}
-
-// predux_max
-
-// predux_min
-
-// predux_mul
-
-#ifdef EIGEN_VECTORIZE_FMA
-
-// pmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fnmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fnmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-#endif
-
-// pnegate
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
- return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
-}
-
-// pconj
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
- return a;
-}
-
-// psqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
- return _mm512_sqrt_ph(a);
-}
-
-// prsqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
- return _mm512_rsqrt_ph(a);
-}
-
-// preciprocal
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
- return _mm512_rcp_ph(a);
-}
-
-// ptranspose
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
- __m512i t[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 16; i++) {
- t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
- t[2 * i + 1] =
- _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
- }
-
- __m512i p[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 8; i++) {
- p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
- p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
- p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
- p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
- }
-
- __m512i q[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 4; i++) {
- q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
- q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
- q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
- q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
- q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
- q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
- q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
- q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
- }
-
- __m512i f[32];
-
-#define PACKET32H_TRANSPOSE_HELPER(X, Y) \
- do { \
- f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
- f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
- f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
- f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
- f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
- f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
- f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
- f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
- } while (false);
-
- PACKET32H_TRANSPOSE_HELPER(0, 0);
- PACKET32H_TRANSPOSE_HELPER(1, 1);
- PACKET32H_TRANSPOSE_HELPER(2, 2);
- PACKET32H_TRANSPOSE_HELPER(3, 3);
-
- PACKET32H_TRANSPOSE_HELPER(1, 0);
- PACKET32H_TRANSPOSE_HELPER(2, 0);
- PACKET32H_TRANSPOSE_HELPER(3, 0);
- PACKET32H_TRANSPOSE_HELPER(2, 1);
- PACKET32H_TRANSPOSE_HELPER(3, 1);
- PACKET32H_TRANSPOSE_HELPER(3, 2);
-
- PACKET32H_TRANSPOSE_HELPER(0, 1);
- PACKET32H_TRANSPOSE_HELPER(0, 2);
- PACKET32H_TRANSPOSE_HELPER(0, 3);
- PACKET32H_TRANSPOSE_HELPER(1, 2);
- PACKET32H_TRANSPOSE_HELPER(1, 3);
- PACKET32H_TRANSPOSE_HELPER(2, 3);
-
-#undef PACKET32H_TRANSPOSE_HELPER
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 32; i++) {
- a.packet[i] = _mm512_castsi512_ph(f[i]);
- }
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
- __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
- t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
- t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
- t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
- t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-
- p0 = _mm512_unpacklo_epi32(t0, t2);
- p1 = _mm512_unpackhi_epi32(t0, t2);
- p2 = _mm512_unpacklo_epi32(t1, t3);
- p3 = _mm512_unpackhi_epi32(t1, t3);
-
- a0 = p0;
- a1 = p1;
- a2 = p2;
- a3 = p3;
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
-
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
-
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
-
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
-
- a.packet[0] = _mm512_castsi512_ph(a0);
- a.packet[1] = _mm512_castsi512_ph(a1);
- a.packet[2] = _mm512_castsi512_ph(a2);
- a.packet[3] = _mm512_castsi512_ph(a3);
-}
-
-// preverse
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
- return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
- a);
-}
-
-// pscatter
-
-template <>
-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
- EIGEN_ALIGN64 half aux[32];
- pstore(aux, from);
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 32; i++) {
- to[stride * i] = aux[i];
- }
-}
-
-// pgather
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
- return _mm512_castsi512_ph(_mm512_set_epi16(
- from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
- from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
- from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
- from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
- from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
- from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
- from[1 * stride].x, from[0 * stride].x));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
- __m512d result = _mm512_undefined_pd();
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
- return _mm512_castpd_ph(result);
-}
-
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
- a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
- b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
-template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = psin(low);
- Packet16h highOut = psin(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pcos(low);
- Packet16h highOut = pcos(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog(low);
- Packet16h highOut = plog(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog2(low);
- Packet16h highOut = plog2(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog1p(low);
- Packet16h highOut = plog1p(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexp(low);
- Packet16h highOut = pexp(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexpm1(low);
- Packet16h highOut = pexpm1(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = ptanh(low);
- Packet16h highOut = ptanh(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1 = _mm256_undefined_si256();
- Packet16h exp2 = _mm256_undefined_si256();
-
- Packet16h lowOut = pfrexp(low, exp1);
- Packet16h highOut = pfrexp(high, exp2);
-
- exponent = combine2Packet16h(exp1, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1;
- Packet16h exp2;
- extract2Packet16h(exponent, exp1, exp2);
-
- Packet16h lowOut = pldexp(low, exp1);
- Packet16h highOut = pldexp(high, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+typedef __m512h Packet32h;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+
+template <>
+struct is_arithmetic<Packet8h> {
+ enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+ typedef Packet32h type;
+ typedef Packet16h half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 32,
+
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
+ HasSetLinear = 0,
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExp = 1,
+ HasExpm1 = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ // These should be implemented in the future
+ HasBessel = 0,
+ HasNdtri = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = 0, // EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet32h> {
+ typedef Eigen::half type;
+ typedef Packet16h half;
+ enum {
+ size = 32,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet16h> {
+ typedef Eigen::half type;
+ typedef Packet8h half;
+ enum {
+ size = 16,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet8h> {
+ typedef Eigen::half type;
+ typedef Packet8h half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+// Memory functions
+
+// pset1
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
+ return _mm512_set1_ph(static_cast<_Float16>(from));
+}
+
+// pset1frombits
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
+ return _mm512_castsi512_ph(_mm512_set1_epi16(from));
+}
+
+// pfirst
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ return half_impl::raw_uint16_to_half(
+ static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
+#else
+ Eigen::half dest[32];
+ _mm512_storeu_ph(dest, from);
+ return dest[0];
+#endif
+}
+
+// pload
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
+}
+
+// ploadu
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
+}
+
+// pstore
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
+}
+
+// pstoreu
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
+}
+
+// ploaddup
+template <>
+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
+ __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
+ return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+ 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+ a);
+}
+
+// ploadquad
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
+ __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
+ return _mm512_permutexvar_ph(
+ _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+ a);
+}
+
+// pabs
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
+ return _mm512_abs_ph(a);
+}
+
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+ return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
+// pmin
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_min_ph(a, b);
+}
+
+// pmax
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_max_ph(a, b);
+}
+
+// plset
+template <>
+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
+ return _mm512_add_ph(_mm512_set1_ph(a),
+ _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
+ 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
+ 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+
+// por
+
+template <>
+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pxor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pand
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pandnot
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+}
+
+// pselect
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+ return _mm512_mask_blend_ph(mask32, a, b);
+}
+
+// pcmp_eq
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_le
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt_or_nan
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+}
+
+// padd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// psub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pmul
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pdiv
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pround
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
+ // Work-around for default std::round rounding mode.
+
+ // Mask for the sign bit
+ const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+ // The largest half-precision float less than 0.5
+ const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+
+ return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+// print
+
+template <>
+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+// pceil
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+// pfloor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// predux
+template <>
+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
+ return (half)_mm512_reduce_add_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
+ return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
+ return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+}
+
+// predux_half_dowto4
+template <>
+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
+ __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+
+ return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#else
+ Eigen::half data[32];
+ _mm512_storeu_ph(data, a);
+
+ __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
+ __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
+
+ return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#endif
+}
+
+// predux_max
+
+// predux_min
+
+// predux_mul
+
+#ifdef EIGEN_VECTORIZE_FMA
+
+// pmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+#endif
+
+// pnegate
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
+ return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
+}
+
+// pconj
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
+ return a;
+}
+
+// psqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
+ return _mm512_sqrt_ph(a);
+}
+
+// prsqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
+ return _mm512_rsqrt_ph(a);
+}
+
+// preciprocal
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
+ return _mm512_rcp_ph(a);
+}
+
+// ptranspose
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
+ __m512i t[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 16; i++) {
+ t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+ t[2 * i + 1] =
+ _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+ }
+
+ __m512i p[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 8; i++) {
+ p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
+ p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
+ p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
+ p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
+ }
+
+ __m512i q[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 4; i++) {
+ q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
+ q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
+ q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
+ q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
+ q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
+ q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
+ q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
+ q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
+ }
+
+ __m512i f[32];
+
+#define PACKET32H_TRANSPOSE_HELPER(X, Y) \
+ do { \
+ f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
+ f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
+ f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
+ f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
+ f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
+ f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
+ f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
+ f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+ } while (false);
+
+ PACKET32H_TRANSPOSE_HELPER(0, 0);
+ PACKET32H_TRANSPOSE_HELPER(1, 1);
+ PACKET32H_TRANSPOSE_HELPER(2, 2);
+ PACKET32H_TRANSPOSE_HELPER(3, 3);
+
+ PACKET32H_TRANSPOSE_HELPER(1, 0);
+ PACKET32H_TRANSPOSE_HELPER(2, 0);
+ PACKET32H_TRANSPOSE_HELPER(3, 0);
+ PACKET32H_TRANSPOSE_HELPER(2, 1);
+ PACKET32H_TRANSPOSE_HELPER(3, 1);
+ PACKET32H_TRANSPOSE_HELPER(3, 2);
+
+ PACKET32H_TRANSPOSE_HELPER(0, 1);
+ PACKET32H_TRANSPOSE_HELPER(0, 2);
+ PACKET32H_TRANSPOSE_HELPER(0, 3);
+ PACKET32H_TRANSPOSE_HELPER(1, 2);
+ PACKET32H_TRANSPOSE_HELPER(1, 3);
+ PACKET32H_TRANSPOSE_HELPER(2, 3);
+
+#undef PACKET32H_TRANSPOSE_HELPER
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 32; i++) {
+ a.packet[i] = _mm512_castsi512_ph(f[i]);
+ }
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
+ __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
+ t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+ t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+ t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+ t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+
+ p0 = _mm512_unpacklo_epi32(t0, t2);
+ p1 = _mm512_unpackhi_epi32(t0, t2);
+ p2 = _mm512_unpacklo_epi32(t1, t3);
+ p3 = _mm512_unpackhi_epi32(t1, t3);
+
+ a0 = p0;
+ a1 = p1;
+ a2 = p2;
+ a3 = p3;
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
+
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
+
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
+
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
+
+ a.packet[0] = _mm512_castsi512_ph(a0);
+ a.packet[1] = _mm512_castsi512_ph(a1);
+ a.packet[2] = _mm512_castsi512_ph(a2);
+ a.packet[3] = _mm512_castsi512_ph(a3);
+}
+
+// preverse
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
+ return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+ a);
+}
+
+// pscatter
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
+ EIGEN_ALIGN64 half aux[32];
+ pstore(aux, from);
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 32; i++) {
+ to[stride * i] = aux[i];
+ }
+}
+
+// pgather
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+ return _mm512_castsi512_ph(_mm512_set_epi16(
+ from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
+ from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
+ from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
+ from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
+ from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
+ from[1 * stride].x, from[0 * stride].x));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+ __m512d result = _mm512_undefined_pd();
+ result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
+ result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
+ return _mm512_castpd_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+ a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
+ b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
+}
+
+// psin
+template <>
+EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = psin(low);
+ Packet16h highOut = psin(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pcos
+template <>
+EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pcos(low);
+ Packet16h highOut = pcos(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog
+template <>
+EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog(low);
+ Packet16h highOut = plog(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog2
+template <>
+EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog2(low);
+ Packet16h highOut = plog2(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog1p
+template <>
+EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog1p(low);
+ Packet16h highOut = plog1p(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pexp(low);
+ Packet16h highOut = pexp(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pexpm1
+template <>
+EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pexpm1(low);
+ Packet16h highOut = pexpm1(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// ptanh
+template <>
+EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = ptanh(low);
+ Packet16h highOut = ptanh(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h exp1 = _mm256_undefined_si256();
+ Packet16h exp2 = _mm256_undefined_si256();
+
+ Packet16h lowOut = pfrexp(low, exp1);
+ Packet16h highOut = pfrexp(high, exp2);
+
+ exponent = combine2Packet16h(exp1, exp2);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h exp1;
+ Packet16h exp2;
+ extract2Packet16h(exponent, exp1, exp2);
+
+ Packet16h lowOut = pldexp(low, exp1);
+ Packet16h highOut = pldexp(high, exp2);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
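
For reference, the transcendental Packet32h ops above all follow one pattern: split the 512-bit packet into two Packet16h halves with extract2Packet16h, apply the existing 16-lane kernel to each, and stitch the results back together with combine2Packet16h. A minimal scalar sketch of that pattern, with stand-in array types instead of the real packets:

#include <array>
#include <cmath>

using Half16 = std::array<float, 16>;  // stand-in for Packet16h
using Half32 = std::array<float, 32>;  // stand-in for Packet32h

// Assumed pre-existing 16-lane kernel, modeled here with std::sin.
static Half16 psin16(const Half16& a) {
  Half16 r;
  for (int i = 0; i < 16; ++i) r[i] = std::sin(a[i]);
  return r;
}

// 32-lane op: extract halves, run the 16-lane kernel twice, recombine.
static Half32 psin32(const Half32& a) {
  Half16 lo, hi;
  for (int i = 0; i < 16; ++i) {
    lo[i] = a[i];       // extract2Packet16h, lower half
    hi[i] = a[16 + i];  // extract2Packet16h, upper half
  }
  const Half16 lo_out = psin16(lo);
  const Half16 hi_out = psin16(hi);
  Half32 out;
  for (int i = 0; i < 16; ++i) {
    out[i] = lo_out[i];  // combine2Packet16h
    out[16 + i] = hi_out[i];
  }
  return out;
}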
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 8fb5b68..d84b1cc 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -582,8 +582,8 @@
// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
// using "Extended precision modular arithmetic"
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
- // This version requires true FMA for high accuracy
+#if defined(EIGEN_VECTORIZE_FMA)
+ // This version requires true FMA for high accuracy.
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
@@ -1181,7 +1181,7 @@
s_lo = psub(y, t);
}
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#ifdef EIGEN_VECTORIZE_FMA
// This function implements the extended precision product of
// a pair of floating point numbers. Given {x, y}, it computes the pair
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
@@ -1227,7 +1227,7 @@
p_lo = pmadd(x_lo, y_lo, p_lo);
}
-#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif // EIGEN_VECTORIZE_FMA
// This function implements Dekker's algorithm for the addition
// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
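
The twoprod routine this hunk re-gates depends on a true fused multiply-add: p_hi is the rounded product, and a single fma recovers its exact rounding error, so x * y == p_hi + p_lo holds exactly. A hedged scalar sketch of that identity (standard <cmath> only; assumes the product neither overflows nor hits the subnormal range):

#include <cmath>

// Error-free product transformation: requires a real FMA so that
// std::fma(x, y, -p_hi) evaluates x*y - p_hi without intermediate rounding.
static void twoprod(float x, float y, float& p_hi, float& p_lo) {
  p_hi = x * y;                  // rounded product
  p_lo = std::fma(x, y, -p_hi);  // exact rounding error of p_hi
}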
diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
deleted file mode 100644
index a159739..0000000
--- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
-#define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
-
-// Only support 128B HVX now.
-// Floating-point operations are only supported since V68.
-#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
-
-namespace Eigen {
-namespace internal {
-
-template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
-class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
- : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
- public:
- typedef Packet32qf AccPacket;
-
- EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero<Packet32qf>(p); }
-
- template <typename LaneIdType>
- EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, Packet32qf& c, Packet32f& /*tmp*/,
- const LaneIdType&) const {
- c = pmadd_f32_to_qf32(a, b, c);
- }
-
- template <typename LaneIdType>
- EIGEN_STRONG_INLINE void madd(const Packet32f& a, const QuadPacket<Packet32f>& b, Packet32qf& c, Packet32f& tmp,
- const LaneIdType& lane) const {
- madd(a, b.get(lane), c, tmp, lane);
- }
-
- EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, Packet32f& r) const {
- r = pmadd_qf32_to_f32(c, alpha, r);
- }
-};
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
-
-#endif // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index 7c69f3b..7e139de 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -18,18 +18,107 @@
namespace Eigen {
namespace internal {
-EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); }
+// HVX utilities.
-EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); }
+template <int D>
+EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
+ HVX_Vector v;
+#if EIGEN_COMP_CLANG
+ // Use inline assembly for an aligned vmem load from unaligned memory.
+ // Casting to HVX_Vector* may confuse the compiler about data alignment.
+ __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
+#else
+ void* aligned_mem =
+ reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
+ memcpy(&v, aligned_mem, __HVX_LENGTH__);
+#endif
+ return v;
+}
-EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; }
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
+ HVX_Vector v;
+ memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
+ return v;
+}
-EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; }
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
+ HVX_Vector v;
+ memcpy(&v, mem, __HVX_LENGTH__);
+ return v;
+}
+
+template <size_t Size, size_t Alignment, typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
+#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
+ // Fast partial vector load through aligned vmem load.
+ // The load may read past the end of the array, but is aligned to prevent a memory fault.
+ HVX_Vector v0 = HVX_vmem<0>(mem);
+ HVX_Vector v1 = v0;
+ uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+ EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
+ // A data size no larger than the alignment never crosses multiple aligned vectors.
+ v1 = v0;
+ }
+ else {
+ uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+ if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
+ v1 = HVX_vmem<1>(mem);
+ } else {
+ v1 = v0;
+ }
+ }
+ return Q6_V_valign_VVR(v1, v0, mem_addr);
+#else
+ HVX_Vector v;
+ memcpy(&v, mem, Size * sizeof(T));
+ return v;
+#endif
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
+ memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
+ memcpy(mem, &v, __HVX_LENGTH__);
+}
+
+template <size_t Size, size_t Alignment, typename T>
+EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
+ uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+ HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
+ uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+ uintptr_t right_off = left_off + Size * sizeof(T);
+
+ HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
+ HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
+
+ EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
+ if (right_off > __HVX_LENGTH__) {
+ Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
+ qr = Q6_Q_vcmp_eq_VbVb(value, value);
+ }
+ }
+
+ ql_not = Q6_Q_or_QQn(ql_not, qr);
+ Q6_vmem_QnRIV(ql_not, mem, value);
+}
+
+// Packet definitions.
+enum class HVXPacketSize {
+ Full,
+ Half,
+ Quarter,
+};
// Hexagon compiler uses same HVX_Vector to represent all HVX vector types.
// Wrap different vector type (float32, int32, etc) to different class with
// explicit constructor and casting back-and-forth to HVX_Vector.
-template <int unique_id>
+template <HVXPacketSize T>
class HVXPacket {
public:
HVXPacket() = default;
@@ -41,24 +130,62 @@
HVX_Vector m_val = Q6_V_vzero();
};
-typedef HVXPacket<0> Packet32f; // float32
-typedef HVXPacket<1> Packet32qf; // qfloat32
+typedef HVXPacket<HVXPacketSize::Full> Packet32f;
+typedef HVXPacket<HVXPacketSize::Half> Packet16f;
+typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
+// Packet traits.
template <>
struct packet_traits<float> : default_packet_traits {
typedef Packet32f type;
- typedef Packet32f half;
+ typedef Packet16f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 32,
+
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 0,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasAbsDiff = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0,
+
+ HasDiv = 0,
+ HasFloor = 0,
+ HasCeil = 0,
+ HasRint = 0,
+
+ HasSin = 0,
+ HasCos = 0,
+ HasACos = 0,
+ HasASin = 0,
+ HasATan = 0,
+ HasATanh = 0,
+ HasLog = 0,
+ HasExp = 0,
+ HasSqrt = 0,
+ HasRsqrt = 0,
+ HasTanh = 0,
+ HasErf = 0,
+ HasBessel = 0,
+ HasNdtri = 0
};
};
template <>
struct unpacket_traits<Packet32f> {
typedef float type;
- typedef Packet32f half;
+ typedef Packet16f half;
enum {
size = 32,
alignment = Aligned128,
@@ -68,15 +195,89 @@
};
};
-// float32 operations.
template <>
-EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+struct unpacket_traits<Packet16f> {
+ typedef float type;
+ typedef Packet8f half;
+ enum {
+ size = 16,
+ // Much code assumes alignment equal to the packet size instead of following
+ // this trait, so we do not use Aligned128 to optimize aligned load/store.
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet8f> {
+ typedef float type;
+ typedef Packet8f half;
+ enum {
+ size = 8,
+ // Much code assumes alignment equal to the packet size instead of following
+ // this trait, so we do not use Aligned128 to optimize aligned load/store.
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+// float32 operations.
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
+ return HVXPacket<T>::Create(Q6_V_vzero());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
+ return pzero_hvx(Packet32f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
+ return pzero_hvx(Packet16f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
+ return pzero_hvx(Packet8f());
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+ return unpacket_traits<HVXPacket<T>>::half::Create(
+ Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
+ return predux_half_dowto4_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
+ return predux_half_dowto4_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
union {
float f;
int32_t i;
} u;
u.f = from;
- return Packet32f::Create(Q6_V_vsplat_R(u.i));
+ return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+ return pset1_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+ return pset1_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+ return pset1_hvx<HVXPacketSize::Quarter>(from);
}
template <>
@@ -84,78 +285,236 @@
return Packet32f::Create(HVX_load(from));
}
template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+ return Packet16f::Create(
+ HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+ return Packet8f::Create(
+ HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
+}
+
+template <>
EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
return Packet32f::Create(HVX_loadu(from));
}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+ return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+ return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
+}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
HVX_store(to, from.Get());
}
template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+ HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+ HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
+}
+
+template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
HVX_storeu(to, from.Get());
}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+ HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+ HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
+}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+}
template <>
EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+ return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
+ return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return pmul_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+}
template <>
EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+ return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
+ return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return padd_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+}
template <>
EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+ return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
+ return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return psub_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
+ return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
+}
template <>
EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
- return psub(Packet32f::Create(Q6_V_vzero()), a);
+ return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+ return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
+ return pnegate_hvx(a);
}
-template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
- return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+ return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+ return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+ return pcmp_le_hvx(a, b);
}
-template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
- return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+ return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+ return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+ return pcmp_eq_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
- HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
- HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
- return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+ return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+ return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+ return pcmp_lt_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
- HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
- HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
- return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+ return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+ return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+ return pcmp_lt_or_nan_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
+ return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
+}
template <>
EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
- HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), Q6_V_vzero());
- return Packet32f::Create(Q6_V_vmux_QVV(pred, a.Get(), pnegate(a).Get()));
+ return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
+ return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+ return pabs_hvx(a);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
+ union {
+ float array[1];
+ HVX_Vector vector;
+ } HVX_and_array;
+ HVX_and_array.vector = a.Get();
+ return HVX_and_array.array[0];
+}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
- float vsf[32] __attribute__((aligned(128)));
- pstore(vsf, a);
- return vsf[0];
+ return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
+ return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
+ return pfirst_hvx(a);
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
@@ -166,13 +525,107 @@
// Shuffle the 64-bit lanes.
HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
-
kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+ // Shuffle the 32-bit lanes.
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+ // Shuffle the 64-bit lanes.
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+ kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+ kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+ kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+ kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
+ // Shuffle the 32-bit lanes.
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+
+ // Shuffle the 64-bit lanes.
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+ kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+ kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
+ kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+ kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
+ // Shuffle the 32-bit lanes.
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+
+ // Shuffle the 64-bit lanes.
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+
+ // Shuffle the 128-bit lanes.
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+
+ kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
+ kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
+ kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
+ kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
+ kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
+ kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
+ kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
+ kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+ // Shuffle the 32-bit lanes.
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+ HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+ HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+ HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+ HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+
+ // Shuffle the 64-bit lanes.
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+ HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+ HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+
+ // Shuffle the 128-bit lanes.
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
+
+ // Shuffle the 256-bit lanes.
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+ v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+ v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+ v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+
+ kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+ kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+ kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+ kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+ kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
+ kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
+ kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
+ kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
+ kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
+ kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
+ kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
+ kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
+ kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
+ kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
+ kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
+ kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
+}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
// Shuffle the 32-bit lanes.
HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
@@ -298,29 +751,67 @@
kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+ HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
+ for (int i = 2; i < packet_size; i <<= 1) {
+ vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
+ }
+ return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
+}
template <>
EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
- HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get());
- HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4);
- HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
- HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
- HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
- return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64)));
+ return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+ return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
+ return predux_hvx(a);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
+ constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
+ HVX_Vector load = HVX_load_partial<size, 0>(from);
+ HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
+ return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
+}
template <>
EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
- HVX_Vector load = HVX_loadu(from);
- HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
- return Packet32f::Create(HEXAGON_HVX_GET_V0(dup));
+ return ploaddup_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
+ return ploaddup_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
+ return ploaddup_hvx<HVXPacketSize::Quarter>(from);
}
-template <>
-EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
- HVX_Vector load = HVX_loadu(from);
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
+ constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
+ HVX_Vector load = HVX_load_partial<size, 0>(from);
HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
- return Packet32f::Create(HEXAGON_HVX_GET_V0(quad));
+ return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
+ return ploadquad_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
+ return ploadquad_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
+ return ploadquad_hvx<HVXPacketSize::Quarter>(from);
}
template <>
@@ -330,99 +821,249 @@
}
template <>
-EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
+ return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}
template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
+ return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
+ return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
+ return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
+ return pmin_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+}
+template <>
EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+ return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
+ return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
+ return pmax_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(a.Get() & b.Get());
+}
template <>
EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(a.Get() & b.Get());
+ return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
+ return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
+ return pand_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(a.Get() | b.Get());
+}
template <>
EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(a.Get() | b.Get());
+ return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
+ return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
+ return por_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ return HVXPacket<T>::Create(a.Get() ^ b.Get());
+}
template <>
EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
- return Packet32f::Create(a.Get() ^ b.Get());
+ return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
+ return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
+ return pxor_hvx(a, b);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
+ return HVXPacket<T>::Create(~a.Get());
+}
template <>
EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
- return Packet32f::Create(~a.Get());
+ return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
+ return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
+ return pnot_hvx(a);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
+ HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+}
template <>
EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
- HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
- return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+ return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
+ return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+ return pselect_hvx(mask, a, b);
}
-template <typename Op>
-EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) {
- Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a);
- Packet32f vredux_8 = op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
- Packet32f vredux_16 = op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
- Packet32f vredux_32 = op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
- Packet32f vredux_64 = op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
- return pfirst(vredux_64);
+template <HVXPacketSize T, typename Op>
+EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+ HVXPacket<T> vredux = a;
+ for (int i = 1; i < packet_size; i <<= 1) {
+ vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
+ }
+ return pfirst(vredux);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
return predux_generic(a, pmax<Packet32f>);
}
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
+ return predux_generic(a, pmax<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
+ return predux_generic(a, pmax<Packet8f>);
+}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
return predux_generic(a, pmin<Packet32f>);
}
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
+ return predux_generic(a, pmin<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
+ return predux_generic(a, pmin<Packet8f>);
+}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
return predux_generic(a, por<Packet32f>) != 0.0f;
}
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
+ return predux_generic(a, por<Packet16f>) != 0.0f;
+}
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+ return predux_generic(a, por<Packet8f>) != 0.0f;
+}
static const float index_vsf[32]
- __attribute__((aligned(128))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+ __attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
+ return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
+}
template <>
EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
- return padd(pload<Packet32f>(index_vsf), pset1<Packet32f>(a));
+ return plset_hvx<HVXPacketSize::Full>(a);
}
-
-// qfloat32 operations.
template <>
-EIGEN_STRONG_INLINE Packet32qf pzero<Packet32qf>(const Packet32qf&) {
- return Packet32qf::Create(Q6_V_vzero());
+EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
+ return plset_hvx<HVXPacketSize::Half>(a);
}
-
template <>
-EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
- return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
+ return plset_hvx<HVXPacketSize::Quarter>(a);
}
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+ float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+ pstore<float>(elements, from);
+ for (Index i = 0; i < packet_size; ++i) {
+ to[i * stride] = elements[i];
+ }
+}
template <>
-EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
- return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
+ pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
+ pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
+ pscatter_hvx(to, from, stride);
}
-// Mixed float32 and qfloat32 operations.
-EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, const Packet32f& b, const Packet32qf& c) {
- return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+ float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+ for (Index i = 0; i < packet_size; i++) {
+ elements[i] = from[i * stride];
+ }
+ return pload<HVXPacket<T>>(elements);
}
-
-EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, const Packet32f& b, const Packet32f& c) {
- return Packet32f::Create(Q6_Vsf_equals_Vqf32(
- Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
+template <>
+EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
+ return pgather_hvx<HVXPacketSize::Full>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
+ return pgather_hvx<HVXPacketSize::Half>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
+ return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
}
} // end namespace internal
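
predux_hvx above replaces the hand-unrolled rotate-and-add chain with a loop that doubles the rotation each round; after log2(size) rounds, every lane holds the full sum. A hedged scalar model of that reduction (power-of-two packet size assumed):

#include <vector>

static float predux_model(const float* lanes, int packet_size) {
  std::vector<float> v(lanes, lanes + packet_size);
  // Each round adds a copy of the vector rotated by `step` lanes, mirroring
  // Q6_V_vror_VR by step * sizeof(float) bytes.
  for (int step = 1; step < packet_size; step <<= 1) {
    std::vector<float> next(packet_size);
    for (int i = 0; i < packet_size; ++i)
      next[i] = v[i] + v[(i + step) % packet_size];
    v = next;
  }
  return v[0];  // every lane now holds the total
}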
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 4e3a14d..71e5f5f 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -1271,7 +1271,7 @@
return pset1<Packet2ul>(0ULL);
}
-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return vfmaq_f32(c, a, b);
@@ -5249,7 +5249,7 @@
return vdivq_f64(a, b);
}
-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
// See bug 936. See above comment about FMA for float.
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index b16952a..e692438 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -354,6 +354,7 @@
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_VSX 1
+#define EIGEN_VECTORIZE_FMA
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
@@ -365,6 +366,7 @@
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ALTIVEC
+#define EIGEN_VECTORIZE_FMA
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
@@ -431,6 +433,11 @@
#include <arm_fp16.h>
#endif
+// Enable FMA for ARM.
+#if defined(__ARM_FEATURE_FMA)
+#define EIGEN_VECTORIZE_FMA
+#endif
+
#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0))
// We can use the optimized fp16 to float and float to fp16 conversion routines
#define EIGEN_HAS_FP16_C
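
These hunks route FMA detection on VSX, AltiVec, and ARM through the same EIGEN_VECTORIZE_FMA macro the x86 paths use, which is what lets the NEON and GenericPacketMathFunctions hunks above drop __ARM_FEATURE_FMA and EIGEN_HAS_SINGLE_INSTRUCTION_MADD. A hedged sketch of a hypothetical downstream check:

#include <Eigen/Core>

// Hypothetical user-side query; previously this needed per-platform macros
// such as __ARM_FEATURE_FMA or __FMA__.
inline bool eigen_reports_fma() {
#ifdef EIGEN_VECTORIZE_FMA
  return true;   // pmadd lowers to a fused multiply-add
#else
  return false;  // pmadd is a separate multiply and add
#endif
}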
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 31f1057..6253454 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -156,7 +156,7 @@
/** \internal Frees memory allocated with handmade_aligned_malloc */
EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) {
- if (ptr) {
+ if (ptr != nullptr) {
uint8_t offset = static_cast<uint8_t>(*(static_cast<uint8_t*>(ptr) - 1));
void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
@@ -224,9 +224,11 @@
EIGEN_DEVICE_FUNC inline void aligned_free(void* ptr) {
#if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
- if (ptr) check_that_malloc_is_allowed();
- EIGEN_USING_STD(free)
- free(ptr);
+ if (ptr != nullptr) {
+ check_that_malloc_is_allowed();
+ EIGEN_USING_STD(free)
+ free(ptr);
+ }
#else
handmade_aligned_free(ptr);
@@ -294,9 +296,11 @@
template <>
EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void* ptr) {
- if (ptr) check_that_malloc_is_allowed();
- EIGEN_USING_STD(free)
- free(ptr);
+ if (ptr != nullptr) {
+ check_that_malloc_is_allowed();
+ EIGEN_USING_STD(free)
+ free(ptr);
+ }
}
template <bool Align>
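
A hedged reading of the two free() changes above: the check_that_malloc_is_allowed() assertion and the free() call now both sit under the null check, so freeing a null pointer is a guaranteed no-op and no longer trips the allocation guard.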
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 126b442..1ec8fb8 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -275,7 +275,7 @@
template <typename MatrixType>
typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::computeShift(Index iu, Index iter) {
using std::abs;
- if (iter == 10 || iter == 20) {
+ if ((iter == 10 || iter == 20) && iu > 1) {
// exceptional shift, taken from http://www.netlib.org/eispack/comqr.f
return abs(numext::real(m_matT.coeff(iu, iu - 1))) + abs(numext::real(m_matT.coeff(iu - 1, iu - 2)));
}
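
A hedged note on the guard: the exceptional shift reads m_matT.coeff(iu, iu - 1) and m_matT.coeff(iu - 1, iu - 2), which is only well-formed when iu > 1, so smaller active blocks now skip it and fall through to the regular shift computation below.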
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 1eb07f7..3132794 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -142,8 +142,8 @@
cholmod_sparse A;
A = viewAsCholmod(mat);
m_rows = matrix.rows();
- Index col = matrix.cols();
- m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
+ m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, internal::convert_index<StorageIndex>(matrix.cols()), &A,
+ &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
if (!m_cR) {
m_info = NumericalIssue;
@@ -196,7 +196,7 @@
const MatrixType matrixR() const {
eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
if (!m_isRUpToDate) {
- m_R = viewAsEigen<Scalar, ColMajor, typename MatrixType::StorageIndex>(*m_cR);
+ m_R = viewAsEigen<Scalar, StorageIndex>(*m_cR);
m_isRUpToDate = true;
}
return m_R;
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 123c89c..8f8a696 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -71,8 +71,13 @@
void resize(Index size, double reserveSizeFactor = 0) {
if (m_allocatedSize < size) {
+ // Avoid underflow on the std::min<Index> call by choosing the smaller index type.
+ using SmallerIndexType =
+ typename std::conditional<static_cast<size_t>((std::numeric_limits<Index>::max)()) <
+ static_cast<size_t>((std::numeric_limits<StorageIndex>::max)()),
+ Index, StorageIndex>::type;
Index realloc_size =
- (std::min<Index>)(NumTraits<StorageIndex>::highest(), size + Index(reserveSizeFactor * double(size)));
+ (std::min<Index>)(NumTraits<SmallerIndexType>::highest(), size + Index(reserveSizeFactor * double(size)));
if (realloc_size < size) internal::throw_std_bad_alloc();
reallocate(realloc_size);
}
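
The underflow being avoided: when StorageIndex can represent values larger than Index, converting NumTraits<StorageIndex>::highest() for the std::min<Index> call wraps to a negative value, clamping realloc_size below size. A hedged standalone illustration with a deliberately narrow 32-bit Index:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  using Index = std::int32_t;            // hypothetical narrow Index
  std::int64_t storage_max = INT64_MAX;  // stand-in for NumTraits<StorageIndex>::highest()
  // The narrowing conversion wraps to -1 on two's-complement targets,
  // so min() returns -1, which is smaller than any requested size.
  Index realloc_size = std::min<Index>(static_cast<Index>(storage_max), 100);
  std::cout << realloc_size << "\n";  // prints -1
  return 0;
}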
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index aee3d94..29be01a 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -99,13 +99,34 @@
* \code
* VectorXd x(n), b(n);
* SparseMatrix<double> A;
- * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
- * // fill A and b;
- * // Compute the ordering permutation vector from the structural pattern of A
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
+ * // Fill A and b.
+ * // Compute the ordering permutation vector from the structural pattern of A.
* solver.analyzePattern(A);
- * // Compute the numerical factorization
+ * // Compute the numerical factorization.
* solver.factorize(A);
- * //Use the factors to solve the linear system
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * We can call compute() directly instead of analyzePattern() and factorize():
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
+ * // Fill A and b.
+ * solver.compute(A);
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * Or give the matrix to the constructor SparseLU(const MatrixType& matrix):
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * // Fill A and b.
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver(A);
+ * // Use the factors to solve the linear system.
* x = solver.solve(b);
* \endcode
*
@@ -150,10 +171,18 @@
enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
public:
+  /** \brief Default constructor of the solver.
+   *
+   * Constructs a SparseLU object. Since no matrix is given, compute() must be called afterward with the matrix to factorize.
+   */
SparseLU()
: m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) {
initperfvalues();
}
+  /** \brief Constructor that factorizes the given matrix immediately.
+   *
+   * Constructs a SparseLU object and calls compute() with the given matrix.
+   */
explicit SparseLU(const MatrixType& matrix)
: m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) {
initperfvalues();
@@ -168,9 +197,15 @@
void factorize(const MatrixType& matrix);
void simplicialfactorize(const MatrixType& matrix);
- /**
+ /** \brief Analyze and factorize the matrix so the solver is ready to solve.
+ *
* Compute the symbolic and numeric factorization of the input sparse matrix.
- * The input matrix should be in column-major storage.
+ * The input matrix should be in column-major storage, otherwise analyzePattern()
+ * will do a heavy copy.
+ *
+   * This is equivalent to calling analyzePattern() followed by factorize().
+ *
+ * \sa analyzePattern(), factorize()
*/
void compute(const MatrixType& matrix) {
// Analyze
@@ -179,7 +214,9 @@
factorize(matrix);
}
- /** \returns an expression of the transposed of the factored matrix.
+ /** \brief Return a solver for the transposed matrix.
+ *
+   * \returns an expression of the transpose of the factored matrix.
*
* A typical usage is to solve for the transposed problem A^T x = b:
* \code
@@ -196,7 +233,9 @@
return transposeView;
}
- /** \returns an expression of the adjoint of the factored matrix
+  /** \brief Return a solver for the adjoint matrix.
+   *
+   * \returns an expression of the adjoint of the factored matrix.
*
* A typical usage is to solve for the adjoint problem A' x = b:
* \code
@@ -215,19 +254,28 @@
return adjointView;
}
+  /** \brief Return the number of rows.
+   */
inline Index rows() const { return m_mat.rows(); }
+  /** \brief Return the number of columns.
+   */
inline Index cols() const { return m_mat.cols(); }
- /** Indicate that the pattern of the input matrix is symmetric */
+  /** \brief Indicate whether the pattern of the input matrix is symmetric.
+   */
void isSymmetric(bool sym) { m_symmetricmode = sym; }
- /** \returns an expression of the matrix L, internally stored as supernodes
+  /** \brief Return the matrix L.
+   *
+   * \returns an expression of the matrix L, internally stored as supernodes.
* The only operation available with this expression is the triangular solve
* \code
* y = b; matrixL().solveInPlace(y);
* \endcode
*/
SparseLUMatrixLReturnType<SCMatrix> matrixL() const { return SparseLUMatrixLReturnType<SCMatrix>(m_Lstore); }
- /** \returns an expression of the matrix U,
+  /** \brief Return the matrix U.
+   *
+   * \returns an expression of the matrix U.
* The only operation available with this expression is the triangular solve
* \code
* y = b; matrixU().solveInPlace(y);
@@ -237,12 +285,14 @@
return SparseLUMatrixUReturnType<SCMatrix, Map<SparseMatrix<Scalar, ColMajor, StorageIndex>>>(m_Lstore, m_Ustore);
}
- /**
+  /** \brief Return the row permutation matrix.
+ *
* \returns a reference to the row matrix permutation \f$ P_r \f$ such that \f$P_r A P_c^T = L U\f$
* \sa colsPermutation()
*/
inline const PermutationType& rowsPermutation() const { return m_perm_r; }
- /**
+  /** \brief Return the column permutation matrix.
+ *
* \returns a reference to the column matrix permutation\f$ P_c^T \f$ such that \f$P_r A P_c^T = L U\f$
* \sa rowsPermutation()
*/
@@ -251,7 +301,9 @@
void setPivotThreshold(const RealScalar& thresh) { m_diagpivotthresh = thresh; }
#ifdef EIGEN_PARSED_BY_DOXYGEN
- /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+  /** \brief Solve the system \f$ A X = B \f$.
+ *
+ * \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
*
   * \warning the destination matrix X in X = this->solve(B) must be column-major.
*
@@ -267,14 +319,17 @@
* \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
* \c InvalidInput if the input matrix is invalid
*
- * \sa iparm()
+ * You can get a readable error message with lastErrorMessage().
+ *
+ * \sa lastErrorMessage()
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "Decomposition is not initialized.");
return m_info;
}
- /**
+  /** \brief Return a human-readable error message.
+ *
* \returns A string describing the type of error
*/
std::string lastErrorMessage() const { return m_lastError; }
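
With info() and lastErrorMessage() now cross-referenced, the intended error-handling pattern looks like this (a usage sketch; matrix assembly omitted):

    #include <Eigen/Dense>
    #include <Eigen/SparseLU>
    #include <iostream>

    bool solve_with_diagnostics(const Eigen::SparseMatrix<double>& A,
                                const Eigen::VectorXd& b, Eigen::VectorXd& x) {
      Eigen::SparseLU<Eigen::SparseMatrix<double>> solver;
      solver.compute(A);
      if (solver.info() != Eigen::Success) {
        // E.g. "THE MATRIX IS STRUCTURALLY SINGULAR" from the factorization below.
        std::cerr << "SparseLU failed: " << solver.lastErrorMessage() << "\n";
        return false;
      }
      x = solver.solve(b);
      return solver.info() == Eigen::Success;
    }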
@@ -302,7 +357,8 @@
return true;
}
- /**
+  /** \brief Return the absolute value of the determinant.
+ *
* \returns the absolute value of the determinant of the matrix of which
   * *this is the LU decomposition.
*
@@ -330,7 +386,9 @@
return det;
}
- /** \returns the natural log of the absolute value of the determinant of the matrix
+  /** \brief Return the natural log of the absolute value of the determinant.
+ *
+ * \returns the natural log of the absolute value of the determinant of the matrix
   * of which *this is the LU decomposition.
*
* \note This method is useful to work around the risk of overflow/underflow that's
@@ -356,7 +414,9 @@
return det;
}
- /** \returns A number representing the sign of the determinant
+  /** \brief Return the sign of the determinant.
+ *
+ * \returns A number representing the sign of the determinant
*
* \sa absDeterminant(), logAbsDeterminant()
*/
@@ -380,7 +440,9 @@
return det * m_detPermR * m_detPermC;
}
- /** \returns The determinant of the matrix.
+  /** \brief Return the determinant.
+ *
+ * \returns The determinant of the matrix.
*
* \sa absDeterminant(), logAbsDeterminant()
*/
@@ -401,7 +463,11 @@
return (m_detPermR * m_detPermC) > 0 ? det : -det;
}
+  /** \brief Return the number of nonzeros in the matrix L.
+   */
Index nnzL() const { return m_nnzL; }
+  /** \brief Return the number of nonzeros in the matrix U.
+   */
Index nnzU() const { return m_nnzU; }
protected:
@@ -442,7 +508,8 @@
}; // End class SparseLU
// Functions needed by the analysis phase
-/**
+/** \brief Compute the column permutation.
+ *
* Compute the column permutation to minimize the fill-in
*
* - Apply this permutation to the input matrix -
@@ -451,6 +518,11 @@
*
* - Postorder the elimination tree and the column permutation
*
+ * It is possible to call compute() instead of analyzePattern() + factorize().
+ *
+ * If the matrix is row-major, this function will make a heavy copy.
+ *
+ * \sa factorize(), compute()
*/
template <typename MatrixType, typename OrderingType>
void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
@@ -516,23 +588,24 @@
// Functions needed by the numerical factorization phase
-/**
+/** \brief Factorize the matrix to get the solver ready.
+ *
* - Numerical factorization
* - Interleaved with the symbolic factorization
- * On exit, info is
*
- * = 0: successful factorization
+ * To check this function for errors, call info(); a readable description of the
+ * error is available through lastErrorMessage().
*
- * > 0: if info = i, and i is
+ * In the past (before 2012; the git history goes back no further), this function returned an integer:
+ * 0 if the factorization was successful;
+ * i with 0 < i <= A->ncol if U(i, i) is exactly zero. The factorization was completed, but the
+ * factor U is exactly singular, and division by zero will occur if it is used to solve a
+ * system of equations;
+ * > A->ncol: the number of bytes allocated when the memory allocation failure occurred, plus
+ * A->ncol. If lwork = -1, it is the estimated amount of space needed, plus A->ncol.
*
- * <= A->ncol: U(i,i) is exactly zero. The factorization has
- * been completed, but the factor U is exactly singular,
- * and division by zero will occur if it is used to solve a
- * system of equations.
+ * (A was the name of the input matrix in that older interface.)
*
- * > A->ncol: number of bytes allocated when memory allocation
- * failure occurred, plus A->ncol. If lwork = -1, it is
- * the estimated amount of space needed, plus A->ncol.
+ * \sa analyzePattern(), compute(), SparseLU(), info(), lastErrorMessage()
*/
template <typename MatrixType, typename OrderingType>
void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix) {
@@ -572,6 +645,8 @@
Index maxpanel = m_perfv.panel_size * m;
// Allocate working storage common to the factor routines
Index lwork = 0;
+  // memInit returns the number of bytes already allocated when allocation fails,
+  // and 0 on success.
Index info = Base::memInit(m, n, nnz, lwork, m_perfv.fillfactor, m_perfv.panel_size, m_glu);
if (info) {
m_lastError = "UNABLE TO ALLOCATE WORKING MEMORY\n\n";
@@ -656,6 +731,7 @@
// Depth-first-search for the current column
VectorBlock<IndexVector> panel_lsubk(panel_lsub, k, m);
VectorBlock<IndexVector> repfnz_k(repfnz, k, m);
+      // Returns 0 on success, or the number of bytes allocated (> 0) when it runs out of space.
info = Base::column_dfs(m, jj, m_perm_r.indices(), m_perfv.maxsuper, nseg, panel_lsubk, segrep, repfnz_k, xprune,
marker, parent, xplore, m_glu);
if (info) {
@@ -667,6 +743,7 @@
// Numeric updates to this column
VectorBlock<ScalarVector> dense_k(dense, k, m);
VectorBlock<IndexVector> segrep_k(segrep, nseg1, m - nseg1);
+      // Returns 0 on success, or the number of bytes allocated (> 0) when it runs out of space.
info = Base::column_bmod(jj, (nseg - nseg1), dense_k, tempv, segrep_k, repfnz_k, jcol, m_glu);
if (info) {
m_lastError = "UNABLE TO EXPAND MEMORY IN COLUMN_BMOD() ";
@@ -676,6 +753,7 @@
}
// Copy the U-segments to ucol(*)
+    // Returns 0 on success, or the number of bytes allocated (> 0) when it runs out of space.
info = Base::copy_to_ucol(jj, nseg, segrep, repfnz_k, m_perm_r.indices(), dense_k, m_glu);
if (info) {
m_lastError = "UNABLE TO EXPAND MEMORY IN COPY_TO_UCOL() ";
@@ -685,6 +763,7 @@
}
// Form the L-segment
+      // Returns 0 on success, or i > 0 if U(i, i) is exactly zero.
info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu);
if (info) {
m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR";
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 2022cf0..2f6f89e 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -30,7 +30,7 @@
hip_reset_flags()
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
- set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx1030)
+ set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index 22cf346..da89857 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-
- <!-- Fixed x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- 2 x 2 Matrix -->
- <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
- <DisplayString>[2, 2] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 3 x 3 Matrix -->
- <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
- <DisplayString>[3, 3] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 4 x 4 Matrix -->
- <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
- <DisplayString>[4, 4] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- Dynamic x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Column Vector -->
- <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_cols</Item>
- <ArrayItems>
- <Size>m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Row Vector -->
- <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_rows</Item>
- <ArrayItems>
- <Size>m_storage.m_rows</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed Vector -->
- <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
- <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
- <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
- <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
- <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- <Item Name="[w]">m_storage.m_data.array[3]</Item>
- </Expand>
- </Type>
-
-</AutoVisualizer>
+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+ <!-- Fixed x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- 2 x 2 Matrix -->
+ <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
+ <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 3 x 3 Matrix -->
+ <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
+ <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 4 x 4 Matrix -->
+ <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
+ <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Column Vector -->
+ <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_cols</Item>
+ <ArrayItems>
+ <Size>m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Row Vector -->
+ <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_rows</Item>
+ <ArrayItems>
+ <Size>m_storage.m_rows</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed Vector -->
+ <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
+ <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
+ <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
+ <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
+ <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ <Item Name="[w]">m_storage.m_data.array[3]</Item>
+ </Expand>
+ </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 273c10d..35ef580 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2],
- w : ($c.m_storage.m_data.array)[3]
- )
- )
-
- preview
- (
- #(
- "[",
- 4,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 4),
- ")"
- )
- )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2]
- )
- )
-
- preview
- (
- #(
- "[",
- 3,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 3),
- ")"
- )
- )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1]
- )
- )
-
- preview
- (
- #(
- "[",
- 2,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 2),
- ")"
- )
- )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0]
- )
- )
-
- preview
- (
- #(
- "[",
- 1,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 1),
- ")"
- )
- )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.RowsAtCompileTime*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data.array)[$i],g],
- size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2],
+ w : ($c.m_storage.m_data.array)[3]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 4,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+ ")"
+ )
+ )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 3,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+ ")"
+ )
+ )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 2,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+ ")"
+ )
+ )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 1,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+ ")"
+ )
+ )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.RowsAtCompileTime*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data.array)[$i],g],
+ size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox
index 645a7cd..7f89554 100644
--- a/doc/TutorialSlicingIndexing.dox
+++ b/doc/TutorialSlicingIndexing.dox
@@ -72,12 +72,12 @@
</tr>
<tr>
<td>%Block starting at \c i,j having \c m rows, and \c n columns</td>
- <td>\code A(seqN(i,m), seqN(i,n)) \endcode</td>
+ <td>\code A(seqN(i,m), seqN(j,n)) \endcode</td>
<td>\code A.block(i,j,m,n) \endcode</td>
</tr>
<tr>
<td>%Block starting at \c i0,j0 and ending at \c i1,j1</td>
- <td>\code A(seq(i0,i1), seq(j0,j1) \endcode</td>
+ <td>\code A(seq(i0,i1), seq(j0,j1)) \endcode</td>
<td>\code A.block(i0,j0,i1-i0+1,j1-j0+1) \endcode</td>
</tr>
<tr>
@@ -97,7 +97,7 @@
</tr>
<tr>
<td>The middle row</td>
- <td>\code A(last/2,all) \endcode</td>
+ <td>\code A(last/2, all) \endcode</td>
<td>\code A.row((A.rows()-1)/2) \endcode</td>
</tr>
<tr>
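
The two corrected table entries map directly to code; a small sketch of the fixed equivalences (Eigen 3.4 slicing API):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(10, 10);
      Index i = 2, j = 4, m = 3, n = 5;
      // Block at (i,j) of size m x n: note seqN(j,n) for the columns, not seqN(i,n).
      MatrixXd B1 = A(seqN(i, m), seqN(j, n));
      MatrixXd B2 = A.block(i, j, m, n);
      // The middle row.
      RowVectorXd r1 = A(last / 2, all);
      RowVectorXd r2 = A.row((A.rows() - 1) / 2);
      return (B1.isApprox(B2) && r1.isApprox(r2)) ? 0 : 1;
    }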
diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js
index afb97ed..f36b332 100644
--- a/doc/eigen_navtree_hacks.js
+++ b/doc/eigen_navtree_hacks.js
@@ -62,23 +62,161 @@
}
}
-// Overloaded to adjust the size of the navtree wrt the toc
-function resizeHeight()
-{
- var header = $("#top");
- var sidenav = $("#side-nav");
- var content = $("#doc-content");
- var navtree = $("#nav-tree");
- var footer = $("#nav-path");
- var toc = $("#nav-toc");
+/*
+ @licstart The following is the entire license notice for the JavaScript code in this file.
- var headerHeight = header.outerHeight();
- var footerHeight = footer.outerHeight();
- var tocHeight = toc.height();
- var windowHeight = $(window).height() - headerHeight - footerHeight;
- content.css({height:windowHeight + "px"});
- navtree.css({height:(windowHeight-tocHeight) + "px"});
- sidenav.css({height:windowHeight + "px"});
+ The MIT License (MIT)
+
+ Copyright (C) 1997-2020 by Dimitri van Heesch
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ and associated documentation files (the "Software"), to deal in the Software without restriction,
+ including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or
+ substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ @licend The above is the entire license notice for the JavaScript code in this file
+ */
+// We need to override entire resizable just so we can change the height to account for the TOC.
+function initResizable()
+{
+ var cookie_namespace = 'doxygen';
+ var sidenav,navtree,content,header,collapsed,collapsedWidth=0,barWidth=6,desktop_vp=768,titleHeight;
+
+ function readCookie(cookie)
+ {
+ var myCookie = cookie_namespace+"_"+cookie+"=";
+ if (document.cookie) {
+ var index = document.cookie.indexOf(myCookie);
+ if (index != -1) {
+ var valStart = index + myCookie.length;
+ var valEnd = document.cookie.indexOf(";", valStart);
+ if (valEnd == -1) {
+ valEnd = document.cookie.length;
+ }
+ var val = document.cookie.substring(valStart, valEnd);
+ return val;
+ }
+ }
+ return 0;
+ }
+
+ function writeCookie(cookie, val, expiration)
+ {
+ if (val==undefined) return;
+ if (expiration == null) {
+ var date = new Date();
+      date.setTime(date.getTime()+(10*365*24*60*60*1000)); // default expiration is 10 years
+ expiration = date.toGMTString();
+ }
+ document.cookie = cookie_namespace + "_" + cookie + "=" + val + "; expires=" + expiration+"; path=/";
+ }
+
+ function resizeWidth()
+ {
+ var windowWidth = $(window).width() + "px";
+ var sidenavWidth = $(sidenav).outerWidth();
+ content.css({marginLeft:parseInt(sidenavWidth)+"px"});
+ writeCookie('width',sidenavWidth-barWidth, null);
+ }
+
+ function restoreWidth(navWidth)
+ {
+ var windowWidth = $(window).width() + "px";
+ content.css({marginLeft:parseInt(navWidth)+barWidth+"px"});
+ sidenav.css({width:navWidth + "px"});
+ }
+
+ function resizeHeight()
+ {
+ var headerHeight = header.outerHeight();
+ var footerHeight = footer.outerHeight();
+ var windowHeight = $(window).height() - headerHeight - footerHeight;
+ //==========================================================================
+ // MODIFICATION:
+ // This small section is the only portion modified within initResizable().
+ // The rest is copy-pasted from the doxygen-generated resize.js.
+ //
+ // Adjust nav height to make room for TOC.
+ var toc = $("#nav-toc");
+ var tocHeight = toc.height();
+ var navHeight = windowHeight;
+ // tocHeight is not always defined (e.g. if empty)
+ if (tocHeight) {
+ navHeight = windowHeight - tocHeight;
+ }
+ //==========================================================================
+
+ content.css({height:windowHeight + "px"});
+ navtree.css({height:navHeight + "px"});
+ sidenav.css({height:windowHeight + "px"});
+
+ var width=$(window).width();
+ if (width!=collapsedWidth) {
+ if (width<desktop_vp && collapsedWidth>=desktop_vp) {
+ if (!collapsed) {
+ collapseExpand();
+ }
+ } else if (width>desktop_vp && collapsedWidth<desktop_vp) {
+ if (collapsed) {
+ collapseExpand();
+ }
+ }
+ collapsedWidth=width;
+ }
+ if (location.hash.slice(1)) {
+ (document.getElementById(location.hash.slice(1))||document.body).scrollIntoView();
+ }
+ }
+
+ function collapseExpand()
+ {
+ if (sidenav.width()>0) {
+ restoreWidth(0);
+ collapsed=true;
+ }
+ else {
+ var width = readCookie('width');
+ if (width>200 && width<$(window).width()) { restoreWidth(width); } else { restoreWidth(200); }
+ collapsed=false;
+ }
+ }
+ header = $("#top");
+ sidenav = $("#side-nav");
+ content = $("#doc-content");
+ navtree = $("#nav-tree");
+ footer = $("#nav-path");
+
+ $(".side-nav-resizable").resizable({resize: function(e, ui) { resizeWidth(); } });
+ $(sidenav).resizable({ minWidth: 0 });
+ $(window).resize(function() { resizeHeight(); });
+ var device = navigator.userAgent.toLowerCase();
+ var touch_device = device.match(/(iphone|ipod|ipad|android)/);
+ if (touch_device) { /* wider split bar for touch only devices */
+ $(sidenav).css({ paddingRight:'20px' });
+ $('.ui-resizable-e').css({ width:'20px' });
+ $('#nav-sync').css({ right:'34px' });
+ barWidth=20;
+ }
+ var width = readCookie('width');
+ if (width) { restoreWidth(width); } else { resizeWidth(); }
+ resizeHeight();
+ var url = location.href;
+ var i=url.indexOf("#");
+ if (i>=0) window.location.hash=url.substr(i);
+ var _preventDefault = function(evt) { evt.preventDefault(); };
+ $("#splitbar").bind("dragstart", _preventDefault).bind("selectstart", _preventDefault);
+ $(".ui-resizable-handle").dblclick(collapseExpand);
+ $(window).on('load',resizeHeight);
}
// Overloaded to save the root node into global_navtree_object
@@ -241,7 +379,4 @@
setTimeout(arguments.callee, 10);
}
})();
-
- $(window).on("load", resizeHeight);
});
-
diff --git a/failtest/const_qualified_block_method_retval_0.cpp b/failtest/const_qualified_block_method_retval_0.cpp
index 08b5d3c..6d19bb4 100644
--- a/failtest/const_qualified_block_method_retval_0.cpp
+++ b/failtest/const_qualified_block_method_retval_0.cpp
@@ -8,6 +8,9 @@
using namespace Eigen;
-void foo(CV_QUALIFIER Matrix3d &m) { Block<Matrix3d, 3, 3> b(m.block<3, 3>(0, 0)); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+ Block<Matrix3d, 3, 3> b(m.block<3, 3>(0, 0));
+ EIGEN_UNUSED_VARIABLE(b);
+}
int main() {}
diff --git a/failtest/const_qualified_block_method_retval_1.cpp b/failtest/const_qualified_block_method_retval_1.cpp
index 06e12c7..d58a018 100644
--- a/failtest/const_qualified_block_method_retval_1.cpp
+++ b/failtest/const_qualified_block_method_retval_1.cpp
@@ -8,6 +8,9 @@
using namespace Eigen;
-void foo(CV_QUALIFIER Matrix3d &m) { Block<Matrix3d> b(m.block(0, 0, 3, 3)); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+ Block<Matrix3d> b(m.block(0, 0, 3, 3));
+ EIGEN_UNUSED_VARIABLE(b);
+}
int main() {}
diff --git a/failtest/const_qualified_diagonal_method_retval.cpp b/failtest/const_qualified_diagonal_method_retval.cpp
index f3acba6..a10796a 100644
--- a/failtest/const_qualified_diagonal_method_retval.cpp
+++ b/failtest/const_qualified_diagonal_method_retval.cpp
@@ -8,6 +8,9 @@
using namespace Eigen;
-void foo(CV_QUALIFIER Matrix3d &m) { Diagonal<Matrix3d> b(m.diagonal()); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+ Diagonal<Matrix3d> b(m.diagonal());
+ EIGEN_UNUSED_VARIABLE(b);
+}
int main() {}
diff --git a/failtest/const_qualified_transpose_method_retval.cpp b/failtest/const_qualified_transpose_method_retval.cpp
index 394f64a..33200a9 100644
--- a/failtest/const_qualified_transpose_method_retval.cpp
+++ b/failtest/const_qualified_transpose_method_retval.cpp
@@ -8,6 +8,9 @@
using namespace Eigen;
-void foo(CV_QUALIFIER Matrix3d &m) { Transpose<Matrix3d> b(m.transpose()); }
+void foo(CV_QUALIFIER Matrix3d &m) {
+ Transpose<Matrix3d> b(m.transpose());
+ EIGEN_UNUSED_VARIABLE(b);
+}
int main() {}
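
All four failtests gain EIGEN_UNUSED_VARIABLE so that the compile-pass configuration (without CV_QUALIFIER) builds cleanly under -Wunused-variable. The macro amounts to a no-op use of the value; a minimal equivalent, assuming the usual definition style:

    // Hypothetical stand-in for EIGEN_UNUSED_VARIABLE (the real macro is
    // defined in Eigen's internal utility headers).
    #define MY_UNUSED_VARIABLE(var) (void)(var)

    void demo() {
      int unused = 42;
      MY_UNUSED_VARIABLE(unused);  // silences -Wunused-variable
    }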
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index 8d6d754..1babd13 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -22,7 +22,7 @@
include_directories(../blas)
set(EigenLapack_SRCS
-single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp
+ dsecnd_INT_CPU_TIME.cpp second_INT_CPU_TIME.cpp single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp
)
if(EIGEN_Fortran_COMPILER_WORKS)
@@ -38,7 +38,6 @@
dlapy2.f dlapy3.f slapy2.f slapy3.f
clacgv.f zlacgv.f
slamch.f dlamch.f
- second_NONE.f dsecnd_NONE.f
)
option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests")
diff --git a/lapack/dsecnd_INT_CPU_TIME.cpp b/lapack/dsecnd_INT_CPU_TIME.cpp
new file mode 100644
index 0000000..684fa1d
--- /dev/null
+++ b/lapack/dsecnd_INT_CPU_TIME.cpp
@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef _WIN32
+#include <cstdint>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <ctime>
+#endif
+
+extern "C" {
+double dsecnd_();
+}
+
+// Elapsed CPU Time in seconds.
+double dsecnd_() {
+#ifdef _WIN32
+ // For MSVC, use `GetProcessTimes` for proper CPU time - MSVC uses
+ // a non-standard `std::clock` implementation (see
+ // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/clock?view=msvc-170).
+ // GetProcessTimes() uses 100-nanosecond time units.
+ FILETIME creation_time, exit_time, kernel_time, user_time;
+ GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
+ ULARGE_INTEGER user;
+ user.HighPart = user_time.dwHighDateTime;
+ user.LowPart = user_time.dwLowDateTime;
+ uint64_t time_100ns = user.QuadPart;
+ return static_cast<double>(time_100ns) / 10000000.0;
+#else
+ return static_cast<double>(std::clock()) / static_cast<double>(CLOCKS_PER_SEC);
+#endif
+}
diff --git a/lapack/second_INT_CPU_TIME.cpp b/lapack/second_INT_CPU_TIME.cpp
new file mode 100644
index 0000000..d6eb402
--- /dev/null
+++ b/lapack/second_INT_CPU_TIME.cpp
@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef _WIN32
+#include <cstdint>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <ctime>
+#endif
+
+extern "C" {
+float second_();
+}
+
+// Elapsed CPU Time in seconds.
+float second_() {
+#ifdef _WIN32
+ // For MSVC, use `GetProcessTimes` for proper CPU time - MSVC uses
+ // a non-standard `std::clock` implementation (see
+ // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/clock?view=msvc-170).
+ // GetProcessTimes() uses 100-nanosecond time units.
+ FILETIME creation_time, exit_time, kernel_time, user_time;
+ GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
+ ULARGE_INTEGER user;
+ user.HighPart = user_time.dwHighDateTime;
+ user.LowPart = user_time.dwLowDateTime;
+ uint64_t time_100ns = user.QuadPart;
+ return static_cast<float>(time_100ns) / 10000000.0f;
+#else
+ return static_cast<float>(std::clock()) / static_cast<float>(CLOCKS_PER_SEC);
+#endif
+}
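
Both timers back the LAPACK second/dsecnd interface referenced by the CMake change above. A hedged usage sketch from C++ (the Fortran-style trailing underscore matches the symbol names defined in these files):

    #include <iostream>

    extern "C" {
    float second_();   // single-precision CPU seconds
    double dsecnd_();  // double-precision CPU seconds
    }

    int main() {
      double t0 = dsecnd_();
      volatile double acc = 0;  // volatile keeps the loop from being optimized away
      for (long i = 0; i < 100000000L; ++i) acc += 1e-9 * static_cast<double>(i);
      double t1 = dsecnd_();
      std::cout << "CPU time: " << (t1 - t0) << " s\n";
      return 0;
    }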
diff --git a/scripts/relicense.py b/scripts/relicense.py
deleted file mode 100644
index 8a5265f..0000000
--- a/scripts/relicense.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# This file is part of Eigen, a lightweight C++ template library
-# for linear algebra.
-#
-# Copyright (C) 2012 Keir Mierle <mierle@gmail.com>
-#
-# This Source Code Form is subject to the terms of the Mozilla
-# Public License v. 2.0. If a copy of the MPL was not distributed
-# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-# Author: mierle@gmail.com (Keir Mierle)
-#
-# Make the long-awaited conversion to MPL.
-
-lgpl3_header = '''
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-'''
-
-mpl2_header = """
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-"""
-
-import os
-import sys
-
-exclusions = set(['relicense.py'])
-
-def update(text):
- if text.find(lgpl3_header) == -1:
- return text, False
- return text.replace(lgpl3_header, mpl2_header), True
-
-rootdir = sys.argv[1]
-for root, sub_folders, files in os.walk(rootdir):
- for basename in files:
- if basename in exclusions:
- print 'SKIPPED', filename
- continue
- filename = os.path.join(root, basename)
- fo = file(filename)
- text = fo.read()
- fo.close()
-
- text, updated = update(text)
- if updated:
- fo = file(filename, "w")
- fo.write(text)
- fo.close()
- print 'UPDATED', filename
- else:
- print ' ', filename
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fbbc98a..4c7c3a4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -48,7 +48,7 @@
set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ")
-
+
ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}")
else()
ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ")
@@ -61,7 +61,7 @@
set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ")
-
+
ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
else()
ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ")
@@ -74,7 +74,7 @@
set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ")
-
+
ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
else()
ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ")
@@ -87,7 +87,7 @@
set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ")
-
+
ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}")
else()
ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ")
@@ -171,6 +171,7 @@
set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official")
add_custom_target(BuildOfficial)
+ei_add_test(clz)
ei_add_test(rand)
ei_add_test(meta)
ei_add_test(maxsizevector)
@@ -406,7 +407,7 @@
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
- string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+ string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
if(EIGEN_TEST_CUDA_CLANG)
string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
@@ -433,12 +434,12 @@
set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}")
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
endif()
-
+
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
-
+
ei_add_test(gpu_example)
ei_add_test(gpu_basic)
-
+
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
@@ -477,7 +478,7 @@
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
else ()
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
- endif()
+ endif()
endif()
endif()
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 0cfea8b..543ef2e 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -97,10 +97,12 @@
Scalar e = static_cast<Scalar>(ref(lhs(i, j), rhs(i, j)));
Scalar a = actual(i, j);
#if EIGEN_ARCH_ARM
- // Work around NEON flush-to-zero mode
- // if ref returns denormalized value and Eigen returns 0, then skip the test
- int ref_fpclass = std::fpclassify(e);
- if (a == Scalar(0) && ref_fpclass == FP_SUBNORMAL) continue;
+ // Work around NEON flush-to-zero mode.
+ // If ref returns a subnormal value and Eigen returns 0, then skip the test.
+ if (a == Scalar(0) && (e > -(std::numeric_limits<Scalar>::min)() && e < (std::numeric_limits<Scalar>::min)()) &&
+ (e <= -std::numeric_limits<Scalar>::denorm_min() || e >= std::numeric_limits<Scalar>::denorm_min())) {
+ continue;
+ }
#endif
bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
((numext::isnan)(a) && (numext::isnan)(e));
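
The rewritten guard is equivalent in intent to the fpclassify-based check it replaces: it accepts exactly the values e with denorm_min <= |e| < min, i.e. the subnormals. A standalone restatement of the predicate, as a hypothetical helper:

#include <limits>

// Hypothetical helper restating the guard above: true exactly when
// denorm_min <= |e| < min, i.e. when e is subnormal. The extra
// parentheses around min() sidestep the Windows min() macro.
template <typename Scalar>
bool is_subnormal(Scalar e) {
  return (e > -(std::numeric_limits<Scalar>::min)() && e < (std::numeric_limits<Scalar>::min)()) &&
         (e <= -std::numeric_limits<Scalar>::denorm_min() || e >= std::numeric_limits<Scalar>::denorm_min());
}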
diff --git a/test/clz.cpp b/test/clz.cpp
new file mode 100644
index 0000000..b56d328
--- /dev/null
+++ b/test/clz.cpp
@@ -0,0 +1,74 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template <typename T>
+int ref_clz(T val) {
+ constexpr int kNumBits = sizeof(T) * CHAR_BIT;
+ T kMsbMask = T(1) << (kNumBits - 1);
+ int z = 0;
+ for (; z < kNumBits && ((val & kMsbMask) == 0); ++z) {
+ val <<= 1;
+ }
+ return z;
+}
+
+template <typename T>
+int ref_ctz(T val) {
+ constexpr int kNumBits = sizeof(T) * CHAR_BIT;
+ T kLsbMask = T(1);
+ int z = 0;
+ for (; z < kNumBits && ((val & kLsbMask) == 0); ++z) {
+ val >>= 1;
+ }
+ return z;
+}
+
+template <typename T>
+void test_clz_ctz() {
+ T step = sizeof(T) <= 2 ? 1 : (Eigen::NumTraits<T>::highest() / (T(1) << 16));
+ T iters = Eigen::NumTraits<T>::highest() / step;
+ for (T i = 0; i < iters; ++i) {
+ T val = i * step;
+ int expected_clz = ref_clz(val);
+ int actual_clz = Eigen::internal::clz(val);
+ VERIFY(expected_clz == actual_clz);
+
+ int expected_ctz = ref_ctz(val);
+ int actual_ctz = Eigen::internal::ctz(val);
+ VERIFY(expected_ctz == actual_ctz);
+ }
+}
+
+template <typename T>
+void test_clz_ctz_random() {
+ for (int i = 0; i < 1024 * 1024; ++i) {
+ T val = Eigen::internal::random<T>();
+ int expected_clz = ref_clz(val);
+ int actual_clz = Eigen::internal::clz(val);
+ VERIFY(expected_clz == actual_clz);
+
+ int expected_ctz = ref_ctz(val);
+ int actual_ctz = Eigen::internal::ctz(val);
+ VERIFY(expected_ctz == actual_ctz);
+ }
+}
+
+EIGEN_DECLARE_TEST(clz) {
+ CALL_SUBTEST_1(test_clz_ctz<uint8_t>());
+ CALL_SUBTEST_2(test_clz_ctz<uint16_t>());
+ CALL_SUBTEST_3(test_clz_ctz<uint32_t>());
+ CALL_SUBTEST_4(test_clz_ctz<uint64_t>());
+
+ for (int i = 0; i < g_repeat; i++) {
+ test_clz_ctz_random<uint32_t>();
+ test_clz_ctz_random<uint64_t>();
+ }
+}
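
The test cross-checks Eigen::internal::clz/ctz against the bit-by-bit references above. For orientation, a minimal sketch (not Eigen's actual implementation, which also covers MSVC via its intrinsics) of the kind of builtin-backed 32-bit variant being exercised, assuming a GCC/Clang toolchain:

#include <cstdint>

// Sketch only -- guards against zero input, for which the
// GCC/Clang builtins are undefined.
inline int sketch_clz32(std::uint32_t x) { return x == 0 ? 32 : __builtin_clz(x); }
inline int sketch_ctz32(std::uint32_t x) { return x == 0 ? 32 : __builtin_ctz(x); }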
diff --git a/test/product_threaded.cpp b/test/product_threaded.cpp
index 410f767..1eb38fb 100644
--- a/test/product_threaded.cpp
+++ b/test/product_threaded.cpp
@@ -13,9 +13,9 @@
void test_parallelize_gemm() {
constexpr int n = 1024;
constexpr int num_threads = 4;
- MatrixXf a(n, n);
- MatrixXf b(n, n);
- MatrixXf c(n, n);
+ MatrixXf a = MatrixXf::Random(n, n);
+ MatrixXf b = MatrixXf::Random(n, n);
+ MatrixXf c = MatrixXf::Random(n, n);
c.noalias() = a * b;
ThreadPool pool(num_threads);
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 130284a..e9ed3d5 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -209,6 +209,11 @@
}
}
+void test_empty() {
+ Eigen::VectorXf empty(0);
+ VERIFY_IS_EQUAL(empty.stableNorm(), 0.0f);
+}
+
template <typename Scalar>
void test_hypot() {
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -235,6 +240,8 @@
}
EIGEN_DECLARE_TEST(stable_norm) {
+ CALL_SUBTEST_1(test_empty());
+
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_3(test_hypot<double>());
CALL_SUBTEST_4(test_hypot<float>());
diff --git a/test/threads_non_blocking_thread_pool.cpp b/test/threads_non_blocking_thread_pool.cpp
index 2f0cf58..e805cf2 100644
--- a/test/threads_non_blocking_thread_pool.cpp
+++ b/test/threads_non_blocking_thread_pool.cpp
@@ -112,53 +112,56 @@
static void test_pool_partitions() {
const int kThreads = 2;
- ThreadPool tp(kThreads);
-
- // Assign each thread to its own partition, so that stealing other work only
- // occurs globally when a thread is idle.
- std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
- for (int i = 0; i < kThreads; ++i) {
- steal_partitions[i] = std::make_pair(i, i + 1);
- }
- tp.SetStealPartitions(steal_partitions);
std::atomic<int> running(0);
std::atomic<int> done(0);
std::atomic<int> phase(0);
- // Schedule kThreads tasks and ensure that they all are running.
- for (int i = 0; i < kThreads; ++i) {
- tp.Schedule([&]() {
- const int thread_id = tp.CurrentThreadId();
- VERIFY_GE(thread_id, 0);
- VERIFY_LE(thread_id, kThreads - 1);
- ++running;
- while (phase < 1) {
- }
- ++done;
- });
+ {
+ ThreadPool tp(kThreads);
+
+ // Assign each thread to its own partition, so that stealing other work only
+ // occurs globally when a thread is idle.
+ std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
+ for (int i = 0; i < kThreads; ++i) {
+ steal_partitions[i] = std::make_pair(i, i + 1);
+ }
+ tp.SetStealPartitions(steal_partitions);
+
+ // Schedule kThreads tasks and ensure that they all are running.
+ for (int i = 0; i < kThreads; ++i) {
+ tp.Schedule([&]() {
+ const int thread_id = tp.CurrentThreadId();
+ VERIFY_GE(thread_id, 0);
+ VERIFY_LE(thread_id, kThreads - 1);
+ ++running;
+ while (phase < 1) {
+ }
+ ++done;
+ });
+ }
+ while (running != kThreads) {
+ }
+ // Schedule each closure to only run on thread 'i' and verify that it does.
+ for (int i = 0; i < kThreads; ++i) {
+ tp.ScheduleWithHint(
+ [&, i]() {
+ ++running;
+ const int thread_id = tp.CurrentThreadId();
+ VERIFY_IS_EQUAL(thread_id, i);
+ while (phase < 2) {
+ }
+ ++done;
+ },
+ i, i + 1);
+ }
+ running = 0;
+ phase = 1;
+ while (running != kThreads) {
+ }
+ running = 0;
+ phase = 2;
}
- while (running != kThreads) {
- }
- // Schedule each closure to only run on thread 'i' and verify that it does.
- for (int i = 0; i < kThreads; ++i) {
- tp.ScheduleWithHint(
- [&, i]() {
- ++running;
- const int thread_id = tp.CurrentThreadId();
- VERIFY_IS_EQUAL(thread_id, i);
- while (phase < 2) {
- }
- ++done;
- },
- i, i + 1);
- }
- running = 0;
- phase = 1;
- while (running != kThreads) {
- }
- running = 0;
- phase = 2;
}
EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool) {
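
The restructuring scopes the ThreadPool in a nested block so that its destructor joins all workers before the shared atomics go out of scope. A standalone sketch of that lifetime pattern, with illustrative names:

#include <atomic>

struct WorkerPool {  // stand-in for Eigen::ThreadPool
  ~WorkerPool() { /* joins all worker threads here */ }
};

void lifetime_pattern() {
  std::atomic<int> done(0);  // declared before the pool: destroyed after it
  {
    WorkerPool pool;         // workers may read/write `done`
    // ... schedule work that increments `done` ...
  }                          // pool destroyed here: all workers joined
  // `done` is still valid at this point; safe to inspect results.
}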
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index c4dedc1..9375398 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -934,6 +934,7 @@
template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
+ EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1132,11 +1133,13 @@
template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
+ EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
}
template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorChippingOp<DimId, Derived>
chip(const Index offset) {
+ EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 8a784af..32980c7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -78,7 +78,9 @@
typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
- : m_xpr(expr), m_offset(offset), m_dim(dim) {}
+ : m_xpr(expr), m_offset(offset), m_dim(dim) {
+ eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range");
+ }
EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; }
EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); }
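
Together with the static asserts added in TensorBase, the new runtime eigen_assert rejects a chip dimension outside [0, NumDimensions) at construction time rather than producing out-of-bounds indexing later. A minimal sketch of the misuse this catches (the function name is illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

void chip_bounds_example() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setZero();
  auto ok = t.chip(0, /*dim=*/2);  // valid: dim must be in [0, 3)
  // t.chip(0, /*dim=*/3);         // now fails eigen_assert: Chip_Dim_out_of_range
  // t.chip<3>(0);                 // now fails to compile: Chip_Dim_out_of_range
  (void)ok;
}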
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c3a7ef4..d0fbfb3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -13,6 +13,8 @@
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
+#include <memory>
+
namespace Eigen {
/** \class TensorForcedEval
@@ -91,6 +93,26 @@
};
} // end namespace internal
+template <typename Device>
+class DeviceTempPointerHolder {
+ public:
+ DeviceTempPointerHolder(const Device& device, size_t size)
+ : device_(device), size_(size), ptr_(device.allocate_temp(size)) {}
+
+ ~DeviceTempPointerHolder() {
+ device_.deallocate_temp(ptr_);
+ size_ = 0;
+ ptr_ = nullptr;
+ }
+
+ void* ptr() { return ptr_; }
+
+ private:
+ Device device_;
+ size_t size_;
+ void* ptr_;
+};
+
template <typename ArgType_, typename Device>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> {
typedef const internal::remove_all_t<ArgType_> ArgType;
@@ -124,13 +146,20 @@
//===--------------------------------------------------------------------===//
TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) {}
+ : m_impl(op.expression(), device),
+ m_op(op.expression()),
+ m_device(device),
+ m_buffer_holder(nullptr),
+ m_buffer(nullptr) {}
+
+ ~TensorEvaluator() { cleanup(); }
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
const Index numValues = internal::array_prod(m_impl.dimensions());
- m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+ m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType));
+ m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr());
internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
@@ -148,7 +177,9 @@
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
const Index numValues = internal::array_prod(m_impl.dimensions());
- m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+ m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType));
+ m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr());
+
typedef TensorEvalToOp<const std::remove_const_t<ArgType>> EvalTo;
EvalTo evalToTmp(m_device.get(m_buffer), m_op);
@@ -162,8 +193,8 @@
#endif
EIGEN_STRONG_INLINE void cleanup() {
- m_device.deallocate_temp(m_buffer);
- m_buffer = NULL;
+ m_buffer_holder = nullptr;
+ m_buffer = nullptr;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; }
@@ -179,7 +210,7 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
- eigen_assert(m_buffer != NULL);
+ eigen_assert(m_buffer != nullptr);
return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
}
@@ -187,13 +218,14 @@
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; }
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device EIGEN_DEVICE_REF m_device;
- EvaluatorPointerType m_buffer;
+ std::shared_ptr<DeviceTempPointerHolder<Device>> m_buffer_holder;
+ EvaluatorPointerType m_buffer; // Cached copy of the value stored in m_buffer_holder.
};
} // end namespace Eigen
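
The shared_ptr-held DeviceTempPointerHolder is what makes the evaluator safe to copy: every copy shares one RAII-managed temporary allocation, which is deallocated exactly once when the last reference goes away, instead of each copy freeing the raw buffer. A standalone sketch of the ownership pattern, with illustrative names:

#include <cstdlib>
#include <memory>

struct TempBuffer {  // stand-in for DeviceTempPointerHolder
  explicit TempBuffer(std::size_t size) : ptr_(std::malloc(size)) {}
  ~TempBuffer() { std::free(ptr_); }  // runs once, for the last owner
  TempBuffer(const TempBuffer&) = delete;
  TempBuffer& operator=(const TempBuffer&) = delete;
  void* ptr() const { return ptr_; }
 private:
  void* ptr_;
};

struct Evaluator {  // stand-in for TensorEvaluator; copyable by design
  void allocate(std::size_t size) { holder_ = std::make_shared<TempBuffer>(size); }
  void cleanup() { holder_ = nullptr; }  // drops only this copy's reference
  std::shared_ptr<TempBuffer> holder_;   // copies of Evaluator share the buffer
};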
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 5e65b26..f92622d 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -309,6 +309,7 @@
out << header << std::endl;
out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n";
int count = 0;
+ EIGEN_UNUSED_VARIABLE(count);
for (int j = 0; j < mat.outerSize(); ++j)
for (typename SparseMatrixType::InnerIterator it(mat, j); it; ++it) {
++count;
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index ce78892..68455b3 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -36,6 +36,8 @@
static void test_static_dimension_failure() {
Tensor<int, 2, DataLayout> left(2, 3);
Tensor<int, 3, DataLayout> right(2, 3, 1);
+ left.setRandom();
+ right.setRandom();
#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
// Technically compatible, but we static assert that the inputs have same
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index ba553f9..228fa9e 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -24,7 +24,7 @@
// Default assignment that does not use block evaluation or vectorization.
// We assume that default coefficient evaluation is well tested and correct.
template <typename Dst, typename Expr>
-static void DefaultAssign(Dst& dst, Expr expr) {
+void DefaultAssign(Dst& dst, Expr expr) {
using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
using Executor = Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
/*Vectorizable=*/false,
@@ -35,7 +35,7 @@
// Assignment with specified device and tiling strategy.
template <bool Vectorizable, TiledEvaluation Tiling, typename Device, typename Dst, typename Expr>
-static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+void DeviceAssign(Device& d, Dst& dst, Expr expr) {
using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
using Executor = Eigen::internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
@@ -52,7 +52,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_unary_expr(Device d) {
+void test_execute_unary_expr(Device d) {
static constexpr int Options = 0 | Layout;
// Pick a large enough tensor size to bypass small tensor block evaluation
@@ -77,7 +77,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_binary_expr(Device d) {
+void test_execute_binary_expr(Device d) {
static constexpr int Options = 0 | Layout;
// Pick a large enough tensor size to bypass small tensor block evaluation
@@ -105,7 +105,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_broadcasting(Device d) {
+void test_execute_broadcasting(Device d) {
static constexpr int Options = 0 | Layout;
auto dims = RandomDims<NumDims>(1, 10);
@@ -134,92 +134,103 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_chipping_rvalue(Device d) {
- auto dims = RandomDims<NumDims>(1, 10);
- Tensor<T, NumDims, Layout, Index> src(dims);
- src.setRandom();
+struct test_execute_chipping_rvalue_runner {
+ template <int ChipDim>
+ static std::enable_if_t<0 <= ChipDim, void> run_dim(Device& d, const array<Index, NumDims>& dims,
+ const Tensor<T, NumDims, Layout, Index>& src) {
+ const auto offset = internal::random<Index>(0, dims[(ChipDim)] - 1);
+ const auto expr = src.template chip<ChipDim>(offset);
-#define TEST_CHIPPING(CHIP_DIM) \
- if (NumDims > (CHIP_DIM)) { \
- const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
- const auto expr = src.template chip<(CHIP_DIM)>(offset); \
- \
- Tensor<T, NumDims - 1, Layout, Index> golden; \
- golden = expr; \
- \
- Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \
- \
- using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
- using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; \
- \
- Executor::run(Assign(dst, expr), d); \
- \
- for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
- VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
- } \
+ Tensor<T, NumDims - 1, Layout, Index> golden;
+ golden = expr;
+
+ Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+
+ // Recursively reduce chip dimension.
+ run_dim<ChipDim - 1>(d, dims, src);
}
- TEST_CHIPPING(0)
- TEST_CHIPPING(1)
- TEST_CHIPPING(2)
- TEST_CHIPPING(3)
- TEST_CHIPPING(4)
- TEST_CHIPPING(5)
+ template <int ChipDim>
+ static std::enable_if_t <
+ ChipDim<0, void> run_dim(Device&, const array<Index, NumDims>&, const Tensor<T, NumDims, Layout, Index>&) {}
-#undef TEST_CHIPPING
+ static void run(Device d) {
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Layout, Index> src(dims);
+ src.setRandom();
+ run_dim<NumDims - 1>(d, dims, src);
+ }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
+void test_execute_chipping_rvalue(Device d) {
+ test_execute_chipping_rvalue_runner<T, NumDims, Device, Vectorizable, Tiling, Layout>::run(d);
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_chipping_lvalue(Device d) {
- auto dims = RandomDims<NumDims>(1, 10);
+struct test_execute_chipping_lvalue_runner {
+ template <int ChipDim>
+ static std::enable_if_t<0 <= ChipDim> run_dim(Device& d, const array<Index, NumDims>& dims) {
+ /* Generate random data that we'll assign to the chipped tensor dim. */
+ array<Index, NumDims - 1> src_dims;
+ for (int i = 0; i < NumDims - 1; ++i) {
+ int dim = i < (ChipDim) ? i : i + 1;
+ src_dims[i] = dims[dim];
+ }
-#define TEST_CHIPPING(CHIP_DIM) \
- if (NumDims > (CHIP_DIM)) { \
- /* Generate random data that we'll assign to the chipped tensor dim. */ \
- array<Index, NumDims - 1> src_dims; \
- for (int i = 0; i < NumDims - 1; ++i) { \
- int dim = i < (CHIP_DIM) ? i : i + 1; \
- src_dims[i] = dims[dim]; \
- } \
- \
- Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \
- src.setRandom(); \
- \
- const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
- \
- Tensor<T, NumDims, Layout, Index> random(dims); \
- random.setZero(); \
- \
- Tensor<T, NumDims, Layout, Index> golden(dims); \
- golden = random; \
- golden.template chip<(CHIP_DIM)>(offset) = src; \
- \
- Tensor<T, NumDims, Layout, Index> dst(dims); \
- dst = random; \
- auto expr = dst.template chip<(CHIP_DIM)>(offset); \
- \
- using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
- using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; \
- \
- Executor::run(Assign(expr, src), d); \
- \
- for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
- VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
- } \
+ Tensor<T, NumDims - 1, Layout, Index> src(src_dims);
+ src.setRandom();
+
+ const auto offset = internal::random<Index>(0, dims[(ChipDim)] - 1);
+
+ Tensor<T, NumDims, Layout, Index> random(dims);
+ random.setZero();
+
+ Tensor<T, NumDims, Layout, Index> golden(dims);
+ golden = random;
+ golden.template chip<(ChipDim)>(offset) = src;
+
+ Tensor<T, NumDims, Layout, Index> dst(dims);
+ dst = random;
+ auto expr = dst.template chip<(ChipDim)>(offset);
+
+ using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
+ using Executor = internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(expr, src), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+
+ run_dim<ChipDim - 1>(d, dims);
}
- TEST_CHIPPING(0)
- TEST_CHIPPING(1)
- TEST_CHIPPING(2)
- TEST_CHIPPING(3)
- TEST_CHIPPING(4)
- TEST_CHIPPING(5)
+ template <int ChipDim>
+ static std::enable_if_t < ChipDim<0, void> run_dim(Device&, const array<Index, NumDims>&) {}
-#undef TEST_CHIPPING
+ static void run(Device d) {
+ auto dims = RandomDims<NumDims>(1, 10);
+ run_dim<NumDims - 1>(d, dims);
+ }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
+void test_execute_chipping_lvalue(Device d) {
+ test_execute_chipping_lvalue_runner<T, NumDims, Device, Vectorizable, Tiling, Layout>::run(d);
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_shuffle_rvalue(Device d) {
+void test_execute_shuffle_rvalue(Device d) {
static constexpr int Options = 0 | Layout;
auto dims = RandomDims<NumDims>(1, 10);
@@ -255,7 +266,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_shuffle_lvalue(Device d) {
+void test_execute_shuffle_lvalue(Device d) {
static constexpr int Options = 0 | Layout;
auto dims = RandomDims<NumDims>(5, 10);
@@ -289,7 +300,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_reshape(Device d) {
+void test_execute_reshape(Device d) {
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
static constexpr int ReshapedDims = NumDims - 1;
@@ -326,7 +337,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_slice_rvalue(Device d) {
+void test_execute_slice_rvalue(Device d) {
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
static constexpr int Options = 0 | Layout;
@@ -362,7 +373,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_slice_lvalue(Device d) {
+void test_execute_slice_lvalue(Device d) {
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
static constexpr int Options = 0 | Layout;
@@ -402,7 +413,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_broadcasting_of_forced_eval(Device d) {
+void test_execute_broadcasting_of_forced_eval(Device d) {
static constexpr int Options = 0 | Layout;
auto dims = RandomDims<NumDims>(1, 10);
@@ -442,7 +453,7 @@
};
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_generator_op(Device d) {
+void test_execute_generator_op(Device d) {
static constexpr int Options = 0 | Layout;
auto dims = RandomDims<NumDims>(20, 30);
@@ -470,7 +481,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_execute_reverse_rvalue(Device d) {
+void test_execute_reverse_rvalue(Device d) {
static constexpr int Options = 0 | Layout;
auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
@@ -502,7 +513,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_async_execute_unary_expr(Device d) {
+void test_async_execute_unary_expr(Device d) {
static constexpr int Options = 0 | Layout;
// Pick a large enough tensor size to bypass small tensor block evaluation
@@ -532,7 +543,7 @@
}
template <typename T, int NumDims, typename Device, bool Vectorizable, TiledEvaluation Tiling, int Layout>
-static void test_async_execute_binary_expr(Device d) {
+void test_async_execute_binary_expr(Device d) {
static constexpr int Options = 0 | Layout;
// Pick a large enough tensor size to bypass small tensor block evaluation
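
The key device in the chipping refactor above is replacing the TEST_CHIPPING macro with compile-time recursion: run_dim<ChipDim> handles one dimension and recurses to run_dim<ChipDim - 1>, with a SFINAE overload for negative ChipDim terminating the recursion. A self-contained sketch of the pattern, with illustrative output:

#include <iostream>
#include <type_traits>

// Base case declared first so it is visible from the recursive overload.
template <int ChipDim>
std::enable_if_t<(ChipDim < 0), void> run_dim() {}  // stop the recursion

template <int ChipDim>
std::enable_if_t<(ChipDim >= 0), void> run_dim() {
  std::cout << "testing chip dim " << ChipDim << "\n";
  run_dim<ChipDim - 1>();  // recurse toward the base case
}

int main() {
  run_dim<2>();  // prints dims 2, 1, 0
}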
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 22d6665..4bf5452 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -7,6 +7,9 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <cstdlib>
+#include <string>
+
#include "sparse.h"
#ifdef min
@@ -19,6 +22,16 @@
#include <Eigen/SparseExtra>
+// Read the TEST_TMPDIR environment variable, if set, and prepend it
+// to the provided filename. Defaults to the local directory.
+std::string GetTestTempFilename(const char* filename) {
+ const char* test_tmpdir = std::getenv("TEST_TMPDIR");
+ if (test_tmpdir == nullptr) {
+ return std::string(filename);
+ }
+ return std::string(test_tmpdir) + std::string("/") + std::string(filename);
+}
+
template <typename SetterType, typename DenseType, typename Scalar, int Options>
bool test_random_setter(SparseMatrix<Scalar, Options>& sm, const DenseType& ref,
const std::vector<Vector2i>& nonzeroCoords) {
@@ -116,8 +129,9 @@
Index cols = internal::random<Index>(1, 100);
SparseMatrixType m1, m2;
m1 = DenseMatrix::Random(rows, cols).sparseView();
- saveMarket(m1, "sparse_extra.mtx");
- loadMarket(m2, "sparse_extra.mtx");
+ std::string filename = GetTestTempFilename("sparse_extra.mtx");
+ saveMarket(m1, filename);
+ loadMarket(m2, filename);
VERIFY_IS_EQUAL(DenseMatrix(m1), DenseMatrix(m2));
}
@@ -126,8 +140,9 @@
Index size = internal::random<Index>(1, 100);
VectorType v1, v2;
v1 = VectorType::Random(size);
- saveMarketVector(v1, "vector_extra.mtx");
- loadMarketVector(v2, "vector_extra.mtx");
+ std::string filename = GetTestTempFilename("vector_extra.mtx");
+ saveMarketVector(v1, filename);
+ loadMarketVector(v2, filename);
VERIFY_IS_EQUAL(v1, v2);
}
@@ -149,8 +164,9 @@
DenseMatrixType m1, m2;
m1 = DenseMatrixType::Random(rows, cols);
- saveMarketDense(m1, "dense_extra.mtx");
- loadMarketDense(m2, "dense_extra.mtx");
+ std::string filename = GetTestTempFilename("dense_extra.mtx");
+ saveMarketDense(m1, filename);
+ loadMarketDense(m2, filename);
VERIFY_IS_EQUAL(m1, m2);
}
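
The same TEST_TMPDIR convention can be followed by any code writing Matrix Market files. A minimal end-to-end sketch, assuming only the Eigen::saveMarket/loadMarket API from MarketIO.h above:

#include <cstdlib>
#include <string>
#include <Eigen/SparseCore>
#include <unsupported/Eigen/SparseExtra>

int main() {
  // Honor TEST_TMPDIR when set, as the test helper above does.
  const char* dir = std::getenv("TEST_TMPDIR");
  const std::string path = (dir ? std::string(dir) + "/" : std::string()) + "example.mtx";

  Eigen::SparseMatrix<double> m(3, 3);
  m.insert(0, 0) = 1.0;
  m.insert(2, 1) = -2.5;
  m.makeCompressed();

  Eigen::saveMarket(m, path);    // write...
  Eigen::SparseMatrix<double> m2;
  Eigen::loadMarket(m2, path);   // ...and read back
  return m2.nonZeros() == m.nonZeros() ? 0 : 1;
}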