Update Eigen to commit:e7c799b7c984f9b8bea27967bb04a97c52e62582 CHANGELOG ========= e7c799b7c - Prevent premature overflow to infinity in exp(x). The changes also provide a 3-4% speedup. 00af47102 - Revert https://gitlab.com/libeigen/eigen/-/commit/040180078db70b8673932d7e5615920d64ceeaf5 8ee6f8475 - Speed up exp(x) by 30-35%. 93ec5450c - disable fill_n optimization for msvc 0af6ab4b7 - Remove unnecessary check for HasBlend trait. d5eec781b - Get rid of redundant computation for large arguments to erf(x). 2fc63808e - Fix C++20 constexpr test compilation failures 5133c836c - Vectorize erf(x) for double. d6e3b528b - Update Assign_MKL.h to cast disparate enum type to int, so it can be compared... 040180078 - Ensure that destructor's needed by lldb make it into binary in non-inlined fashion 0fb2ed140 - Make element accessors constexpr 8b4efc8ed - check_size_for_overflow: use numeric limits instead of c99 macro PiperOrigin-RevId: 698131745 Change-Id: Ia044ff8444a6d2266afdd7ad05556afd3be2366d
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index f7f0b23..f40b2f4 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h
@@ -738,6 +738,7 @@ } // Specialization for filling the destination with a constant value. +#if !EIGEN_COMP_MSVC #ifndef EIGEN_GPU_COMPILE_PHASE template <typename DstXprType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop( @@ -748,6 +749,7 @@ std::fill_n(dst.data(), dst.size(), src.functor()()); } #endif +#endif template <typename DstXprType, typename SrcXprType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 5b566cd..ad11220 100644 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h
@@ -89,7 +89,7 @@ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE, EIGENTYPE> &func) { \ resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ - if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == LinearTraversal) { \ + if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == (int)LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE *)src.nestedExpression().data(), \ (VMLTYPE *)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } else { \
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 879b0db..156ca2b 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h
@@ -124,8 +124,7 @@ // noncopyable: // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization) // and make complex evaluator much larger than then should do. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator_base() = default; private: EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&); @@ -143,7 +142,7 @@ template <typename Scalar, int OuterStride> class plainobjectbase_evaluator_data { public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { #ifndef EIGEN_INTERNAL_DEBUGGING EIGEN_UNUSED_VARIABLE(outerStride); @@ -157,9 +156,9 @@ template <typename Scalar> class plainobjectbase_evaluator_data<Scalar, Dynamic> { public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return m_outerStride; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const { return m_outerStride; } const Scalar* data; protected: @@ -189,32 +188,34 @@ : RowsAtCompileTime }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() : m_d(0, OuterStrideAtCompileTime) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const PlainObjectType& m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const PlainObjectType& m) : m_d(m.data(), IsVectorAtCompileTime ? 
0 : m.outerStride()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) return m_d.data[row * m_d.outerStride() + col]; else return m_d.data[row + col * m_d.outerStride()]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_d.data[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col]; else return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return const_cast<Scalar*>(m_d.data)[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { + return const_cast<Scalar*>(m_d.data)[index]; + } template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -251,9 +252,10 @@ : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> { typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m) + : evaluator<PlainObjectBase<XprType>>(m) {} }; template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols> @@ -261,9 +263,10 @@ 
: evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> { typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m) + : evaluator<PlainObjectBase<XprType>>(m) {} }; // -------------------- Transpose --------------------
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 30e0aa3..97f9b50 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -298,7 +298,7 @@ * * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index) */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) { eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); return internal::evaluator<Derived>(derived()).coeffRef(row, col); } @@ -312,7 +312,7 @@ * \sa operator[](Index) */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index row, Index col) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index row, Index col) { eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); return coeffRef(row, col); } @@ -332,7 +332,7 @@ * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index) */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit, THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS) eigen_internal_assert(index >= 0 && index < size()); @@ -346,7 +346,7 @@ * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator[](Index index) { EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime, THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD) eigen_assert(index >= 0 && index < size());
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index a2d6ee2..6d16700 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h
@@ -46,9 +46,9 @@ typedef typename internal::traits<Derived>::StorageKind StorageKind; /** \returns a reference to the derived object */ - EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast<Derived*>(this); } + EIGEN_DEVICE_FUNC constexpr Derived& derived() { return *static_cast<Derived*>(this); } /** \returns a const reference to the derived object */ - EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast<const Derived*>(this); } + EIGEN_DEVICE_FUNC constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); } EIGEN_DEVICE_FUNC inline Derived& const_cast_derived() const { return *static_cast<Derived*>(const_cast<EigenBase*>(this));
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 1980e92..7e4f054 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -1934,6 +1934,22 @@ } template <> +EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Packet4d& exponent) { + // Clamp exponent to [-1023, 1024] + const Packet4d min_exponent = pset1<Packet4d>(-1023.0); + const Packet4d max_exponent = pset1<Packet4d>(1024.0); + const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent)); + const Packet4i bias = pset1<Packet4i>(1023); + + // 2^e + Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3); + const Packet4i lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + return pmul(a, c); // a * 2^e +} + +template <> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) { return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)))); }
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 78d17d5..5d869e4 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -155,6 +155,7 @@ HasExp = 1, HasATan = 1, HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, HasATanh = 1, HasCmp = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index da26cd4..49220ca 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -3183,6 +3183,7 @@ HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, HasATanh = 1, HasATan = 0,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 4e441b4..e21d3ef 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -274,22 +274,20 @@ // // Assumes IEEE floating point format template <typename Packet> -struct pldexp_fast_impl { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) { typedef typename unpacket_traits<Packet>::integer_packet PacketI; typedef typename unpacket_traits<Packet>::type Scalar; typedef typename unpacket_traits<PacketI>::type ScalarI; static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1, ExponentBits = TotalBits - MantissaBits - 1; - static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) { - const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127 - const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255 - // restrict biased exponent between 0 and 255 for float. - const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 - // return a * (2^e) - return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e))); - } -}; + const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127 + const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255 + // restrict biased exponent between 0 and 255 for float. + const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 + // return a * (2^e) + return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e))); +} // Natural or base 2 logarithm. 
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) @@ -514,6 +512,7 @@ const Packet cst_half = pset1<Packet>(0.5f); const Packet cst_exp_hi = pset1<Packet>(88.723f); const Packet cst_exp_lo = pset1<Packet>(-104.f); + const Packet cst_pldexp_threshold = pset1<Packet>(87.0); const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f); const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f); @@ -549,7 +548,12 @@ y = pmadd(r2, y, p_low); // Return 2^m * exp(r). - // TODO: replace pldexp with faster implementation since y in [-1, 1). + const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x)); + if (!predux_any(fast_pldexp_unsafe)) { + // For |x| <= 87, we know the result is not zero or inf, and we can safely use + // the fast version of pldexp. + return pmax(pldexp_fast(y, m), _x); + } return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x)); } @@ -562,8 +566,8 @@ const Packet cst_half = pset1<Packet>(0.5); const Packet cst_exp_hi = pset1<Packet>(709.784); - const Packet cst_exp_lo = pset1<Packet>(-709.784); - + const Packet cst_exp_lo = pset1<Packet>(-745.519); + const Packet cst_pldexp_threshold = pset1<Packet>(708.0); const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599); const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4); const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2); @@ -616,7 +620,12 @@ // Construct the result 2^n * exp(g) = e * x. The max is used to catch // non-finite values in the input. - // TODO: replace pldexp with faster implementation since x in [-1, 1). + const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x)); + if (!predux_any(fast_pldexp_unsafe)) { + // For |x| <= 708, we know the result is not zero or inf, and we can safely use + // the fast version of pldexp. + return pmax(pldexp_fast(x, fx), _x); + } return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x)); }
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 3b362f4..ac0e2cf 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -42,6 +42,18 @@ template <typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent); +// Explicitly multiplies +// a * (2^e) +// clamping e to the range +// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()] +// +// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow +// if 2^e doesn't fit into a normal floating-point Scalar. +// +// Assumes IEEE floating point format +template <typename Packet> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent); + /** \internal \returns log(x) for single precision float */ template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 2f401fd..3f2d9d5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -5141,7 +5141,7 @@ HasSqrt = 1, HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, - HasErf = 0, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH }; };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index c749763..f294009 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -1766,7 +1766,6 @@ // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well // supported by SSE, and has more range than is needed for exponents. -// TODO(rmlarsen): Remove this specialization once Packet2l has support or casting. template <> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) { // Clamp exponent to [-2099, 2099] @@ -1787,6 +1786,24 @@ return out; } +// We specialize pldexp_fast here, since the generic implementation uses Packet2l, which is not well +// supported by SSE, and has more range than is needed for exponents. +template <> +EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-1023, 1024] + const Packet2d min_exponent = pset1<Packet2d>(-1023.0); + const Packet2d max_exponent = pset1<Packet2d>(1024.0); + const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent); + + // Convert e to integer and swizzle to low-order bits. + const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3); + + // Compute 2^e multiply: + const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023); + const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52)); // 2^e + return pmul(a, c); +} + // with AVX, the default implementations based on pload1 are faster #ifndef __AVX__ template <>
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index c53bb90..a478b80 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -131,8 +131,7 @@ struct functor_traits<linspaced_op<Scalar> > { enum { Cost = 1, - PacketAccess = - (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend, + PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear, /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is // currently disabled IsRepeatable = true
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index defd3c2..03542e3 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -1292,7 +1292,7 @@ p = pmadd(r2, p, p_low); // 4. Undo subtractive range reduction exp(m*ln(2) + r) = 2^m * exp(r). - Packet e = pldexp_fast_impl<Packet>::run(p, m); + Packet e = pldexp_fast(p, m); // 5. Undo multiplicative range reduction by using exp(r) = exp(r/2)^2. e = pmul(e, e);
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 2acdd9d..a278c91 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h
@@ -391,7 +391,7 @@ template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) { - constexpr std::size_t max_elements = PTRDIFF_MAX / sizeof(T); + constexpr std::size_t max_elements = (std::numeric_limits<std::ptrdiff_t>::max)() / sizeof(T); if (size > max_elements) throw_std_bad_alloc(); }
diff --git a/test/constexpr.cpp b/test/constexpr.cpp index 9fdf447..34c728f 100644 --- a/test/constexpr.cpp +++ b/test/constexpr.cpp
@@ -16,38 +16,48 @@ // until after the constructor returns: // error: member ‘Eigen::internal::plain_array<int, 9, 0, 0>::array’ must be initialized by mem-initializer in // ‘constexpr’ constructor -#if EIGEN_COMP_CXXVER >= 20 +#if __cpp_constexpr >= 201907L constexpr Matrix3i mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); VERIFY_IS_EQUAL(mat.size(), 9); - VERIFY_IS_EQUAL(mat(0, 0), 1); + static_assert(mat(0, 0) == 1); + static_assert(mat(0) == 1); static_assert(mat.coeff(0, 1) == 2); constexpr Array33i arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - VERIFY_IS_EQUAL(arr(0, 0), 1); + static_assert(arr(0, 0) == 1); + static_assert(arr(0) == 1); VERIFY_IS_EQUAL(arr.size(), 9); static_assert(arr.coeff(0, 1) == 2); + constexpr RowVector3i vec{{1, 2, 3}}; + static_assert(vec(0, 0) == 1); + static_assert(vec[0] == 1); + VERIFY_IS_EQUAL(vec.size(), 3); + static_assert(vec.coeff(0, 1) == 2); + // Also check dynamic size arrays/matrices with fixed-size storage (currently // only works if all elements are initialized, since otherwise the compiler // complains about uninitialized trailing elements. constexpr Matrix<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); VERIFY_IS_EQUAL(dyn_mat.size(), 9); - VERIFY_IS_EQUAL(dyn_mat(0, 0), 1); + static_assert(dyn_mat(0, 0) == 1); static_assert(dyn_mat.coeff(0, 1) == 2); constexpr Array<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - VERIFY_IS_EQUAL(dyn_arr(0, 0), 1); + static_assert(dyn_arr(0, 0) == 1); + static_assert(dyn_arr(0) == 1); VERIFY_IS_EQUAL(dyn_arr.size(), 9); static_assert(dyn_arr.coeff(0, 1) == 2); -#endif // EIGEN_COMP_CXXVER >= 20 +#endif // __cpp_constexpr >= 201907L } // Check that we can use the std::initializer_list constructor for constexpr variables. -#if EIGEN_COMP_CXXVER >= 20 +#if __cpp_constexpr >= 201907L // EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT() will fail constexpr evaluation unless // we have std::is_constant_evaluated(). 
constexpr Matrix<int, 2, 2> global_mat({{1, 2}, {3, 4}}); EIGEN_DECLARE_TEST(constexpr_global) { VERIFY_IS_EQUAL(global_mat.size(), 4); - VERIFY_IS_EQUAL(global_mat(0, 0), 1); + static_assert(global_mat(0, 0) == 1); + static_assert(global_mat(0) == 1); static_assert(global_mat.coeff(0, 0) == 1); } -#endif // EIGEN_COMP_CXXVER >= 20 +#endif // __cpp_constexpr >= 201907L
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index 0f6ab64..d394a94 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp
@@ -38,6 +38,13 @@ // all fixed-size, fixed-dimension plain object types are trivially move constructible static_assert(std::is_trivially_move_constructible<Matrix4f>::value, "Matrix4f not trivially_move_constructible"); static_assert(std::is_trivially_move_constructible<Array4f>::value, "Array4f not trivially_move_constructible"); +// all statically-allocated plain object types are trivially destructible +static_assert(std::is_trivially_destructible<Matrix4f>::value, "Matrix4f not trivially_destructible"); +static_assert(std::is_trivially_destructible<Array4f>::value, "Array4f not trivially_destructible"); +static_assert(std::is_trivially_destructible<Matrix<float, 4, Dynamic, 0, 4, 4>>::value, + "Matrix4X44 not trivially_destructible"); +static_assert(std::is_trivially_destructible<Array<float, 4, Dynamic, 0, 4, 4>>::value, + "Array4X44 not trivially_destructible"); #if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) // all fixed-size, fixed-dimension plain object types are trivially copy constructible static_assert(std::is_trivially_copy_constructible<Matrix4f>::value, "Matrix4f not trivially_copy_constructible");
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 0b266f9..5f95fd0 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -269,81 +269,8 @@ } }; -/**************************************************************************** - * Implementation of erf, requires C++11/C99 * - ****************************************************************************/ - -/** \internal \returns the error function of \a a (coeff-wise) - This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for - normalized floats. - - This implementation works on both scalars and SIMD "packets". -*/ -template <typename T> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) { - // The monomial coefficients of the numerator polynomial (odd). - constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f, - 3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f, - 1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f}; - - // The monomial coefficients of the denominator polynomial (even). - constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f, - 1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f, - 4.99425798654556274414062500000e-01f, 1.0f}; - - // Since the polynomials are odd/even, we need x^2. - // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid - // computing Inf/Inf below. - const T x2 = pmin(pset1<T>(16.0f), pmul(x, x)); - - // Evaluate the numerator polynomial p. - T p = ppolevl<T, 5>::run(x2, alpha); - p = pmul(x, p); - - // Evaluate the denominator polynomial p. - T q = ppolevl<T, 5>::run(x2, beta); - const T r = pdiv(p, q); - - // Clamp to [-1:1]. 
- return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f)); -} - -template <typename T> -struct erf_impl { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf_float(x); } -}; - -template <typename Scalar> -struct erf_retval { - typedef Scalar type; -}; - -#if EIGEN_HAS_C99_MATH -template <> -struct erf_impl<float> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if defined(SYCL_DEVICE_ONLY) - return cl::sycl::erf(x); -#else - return generic_fast_erf_float(x); -#endif - } -}; - -template <> -struct erf_impl<double> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if defined(SYCL_DEVICE_ONLY) - return cl::sycl::erf(x); -#else - return ::erf(x); -#endif - } -}; -#endif // EIGEN_HAS_C99_MATH - /*************************************************************************** - * Implementation of erfc, requires C++11/C99 * + * Implementation of erfc. ****************************************************************************/ template <typename Scalar> struct generic_fast_erfc { @@ -366,7 +293,7 @@ 2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01, 3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00}; const T x2 = pmul(x, x); - const T one = pset1<T>(1.0); + const T one = pset1<T>(1.0f); const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one); // Return early if we don't need the more expensive approximation for any @@ -401,46 +328,42 @@ return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); } -template <> +// Computes erf(x)/x for |x| <= 1. Used by both erf and erfc implementations. +// Takes x2 = x^2 as input. +// +// PRECONDITION: x2 <= 1. template <typename T> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) { - // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold). 
- // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x. - constexpr double kClamp = 28.0; - const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp)); - - // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erf_over_x_double_small(const T& x2) { + // erf(x)/x = S(x^2) / T(x^2), x^2 <= 1. // // Coefficients for S and T generated with Rminimax command: - // ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10] + // ./ratapprox --function="erf(x)" --dom='[-1,1]' --type=[9,10] // --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec" - constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04, - -1.8272566210022942682217328425053892715368419885635375976562500e-03, - -4.5303363351690106863856044583371840417385101318359375000000000e-02, - -1.4215015503619179981775744181504705920815467834472656250000000e-01, - -1.1283791670955125585606992899556644260883331298828125000000000e+00}; + constexpr double alpha[] = {1.9493725660006057018823477644531294572516344487667083740234375e-04, + 1.8272566210022942682217328425053892715368419885635375976562500e-03, + 4.5303363351690106863856044583371840417385101318359375000000000e-02, + 1.4215015503619179981775744181504705920815467834472656250000000e-01, + 1.1283791670955125585606992899556644260883331298828125000000000e+00}; constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05, 6.8117805899186819641732970609382391558028757572174072265625000e-04, 1.0582026056098614921752165685120417037978768348693847656250000e-02, 9.3252603143757495374188692949246615171432495117187500000000000e-02, 4.5931062818368939559832142549566924571990966796875000000000000e-01, 1.0}; - const T x2 = pmul(x, x); const T num_small = ppolevl<T, 4>::run(x2, alpha); const T denom_small = ppolevl<T, 5>::run(x2, beta); - const T one = pset1<T>(1.0); - const T erfc_small = pmadd(x, 
pdiv(num_small, denom_small), one); + return pdiv(num_small, denom_small); +} - // Return early if we don't need the more expensive approximation for any - // entry in a. - const T x_abs_gt_one_mask = pcmp_lt(one, x2); - if (!predux_any(x_abs_gt_one_mask)) return erfc_small; - - // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27. - // - // Coefficients for P and Q generated with Rminimax command: - // ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]" - // --denF="[D]" --log --dispCoeff="dec" +// erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 28. +// +// Coefficients for P and Q generated with Rminimax command: +// ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]" +// --denF="[D]" --log --dispCoeff="dec" +// +// PRECONDITION: 1 < x < 28. +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erfc_double_large(const T& x, const T& x2) { constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04, 1.0909912393738931124520519233556115068495273590087890625000000e-02, 1.0628604636755033252537572252549580298364162445068359375000000e-01, @@ -461,7 +384,7 @@ 3.152505418656005586885981983868987299501895904541015625000000e-02, 2.565085751861882583380047861965067568235099315643310546875000e-03, 7.899362131678837697403017248376499992446042597293853759765625e-05}; - + // Compute exp(-x^2). const T x2_lo = twoprod_low(x, x, x2); // Here we use that // exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo) @@ -469,12 +392,34 @@ // from 258 ulps to below 7 ulps. const T exp2_hi = pexp(pnegate(x2)); const T z = pnmadd(exp2_hi, x2_lo, exp2_hi); + // Compute r = P / Q. 
const T q2 = preciprocal(x2); const T num_large = ppolevl<T, 9>::run(q2, gamma); const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta)); const T r = pdiv(num_large, denom_large); const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0)); - const T erfc_large = pmadd(z, r, maybe_two); + return pmadd(z, r, maybe_two); +} + +template <> +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) { + // Clamp x to [-28:28] beyond which erfc(x) is either two or zero (below the underflow threshold). + // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x. + constexpr double kClamp = 28.0; + const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp)); + + // For |x| < 1, we use erfc(x) = 1 - erf(x). + const T x2 = pmul(x, x); + const T one = pset1<T>(1.0); + const T erfc_small = pnmadd(x, erf_over_x_double_small(x2), one); + + // Return early if we don't need the more expensive approximation for any + // entry in a. + const T x_abs_gt_one_mask = pcmp_lt(one, x2); + if (!predux_any(x_abs_gt_one_mask)) return erfc_small; + + const T erfc_large = erfc_double_large(x, x2); return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); } @@ -513,6 +458,104 @@ }; #endif // EIGEN_HAS_C99_MATH +/**************************************************************************** + * Implementation of erf. + ****************************************************************************/ + +template <typename Scalar> +struct generic_fast_erf { + template <typename T> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in); +}; + +/** \internal \returns the error function of \a a (coeff-wise) + This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for + normalized floats. + + This implementation works on both scalars and SIMD "packets". 
+*/ +template <> +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<float>::run(const T& x) { + // The monomial coefficients of the numerator polynomial (odd). + constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f, + 3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f, + 1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f}; + + // The monomial coefficients of the denominator polynomial (even). + constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f, + 1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f, + 4.99425798654556274414062500000e-01f, 1.0f}; + + // Since the polynomials are odd/even, we need x^2. + // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid + // computing Inf/Inf below. + const T x2 = pmin(pset1<T>(16.0f), pmul(x, x)); + + // Evaluate the numerator polynomial p. + T p = ppolevl<T, 5>::run(x2, alpha); + p = pmul(x, p); + + // Evaluate the denominator polynomial p. + T q = ppolevl<T, 5>::run(x2, beta); + const T r = pdiv(p, q); + + // Clamp to [-1:1]. + return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f)); +} + +template <> +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<double>::run(const T& x) { + T x2 = pmul(x, x); + T erf_small = pmul(x, erf_over_x_double_small(x2)); + + // Return early if we don't need the more expensive approximation for any + // entry in a. + const T one = pset1<T>(1.0); + const T x_abs_gt_one_mask = pcmp_lt(one, x2); + if (!predux_any(x_abs_gt_one_mask)) return erf_small; + + // For |x| > 1, use erf(x) = 1 - erfc(x). 
+ const T erf_large = psub(one, erfc_double_large(x, x2)); + return pselect(x_abs_gt_one_mask, erf_large, erf_small); +} + +template <typename T> +struct erf_impl { + typedef typename unpacket_traits<T>::type Scalar; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf<Scalar>::run(x); } +}; + +template <typename Scalar> +struct erf_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erf_impl<float> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erf(x); +#else + return generic_fast_erf<float>::run(x); +#endif + } +}; + +template <> +struct erf_impl<double> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erf(x); +#else + return generic_fast_erf<double>::run(x); +#endif + } +}; +#endif // EIGEN_HAS_C99_MATH + /*************************************************************************** * Implementation of ndtri. * ****************************************************************************/