Update Eigen to commit:e7c799b7c984f9b8bea27967bb04a97c52e62582
CHANGELOG
=========
e7c799b7c - Prevent premature overflow to infinity in exp(x). The changes also provide a 3-4% speedup.
00af47102 - Revert https://gitlab.com/libeigen/eigen/-/commit/040180078db70b8673932d7e5615920d64ceeaf5
8ee6f8475 - Speed up exp(x) by 30-35%.
93ec5450c - disable fill_n optimization for msvc
0af6ab4b7 - Remove unnecessary check for HasBlend trait.
d5eec781b - Get rid of redundant computation for large arguments to erf(x).
2fc63808e - Fix C++20 constexpr test compilation failures
5133c836c - Vectorize erf(x) for double.
d6e3b528b - Update Assign_MKL.h to cast disparate enum type to int, so it can be compared...
040180078 - Ensure that destructor'\''s needed by lldb make it into binary in non-inlined fashion
0fb2ed140 - Make element accessors constexpr
8b4efc8ed - check_size_for_overflow: use numeric limits instead of c99 macro
PiperOrigin-RevId: 698131745
Change-Id: Ia044ff8444a6d2266afdd7ad05556afd3be2366d
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index f7f0b23..f40b2f4 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -738,6 +738,7 @@
}
// Specialization for filling the destination with a constant value.
+#if !EIGEN_COMP_MSVC
#ifndef EIGEN_GPU_COMPILE_PHASE
template <typename DstXprType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(
@@ -748,6 +749,7 @@
std::fill_n(dst.data(), dst.size(), src.functor()());
}
#endif
+#endif
template <typename DstXprType, typename SrcXprType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 5b566cd..ad11220 100644
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -89,7 +89,7 @@
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE, EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
- if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == LinearTraversal) { \
+ if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == (int)LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE *)src.nestedExpression().data(), \
(VMLTYPE *)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \
} else { \
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 879b0db..156ca2b 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -124,8 +124,7 @@
// noncopyable:
// Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
// and make complex evaluator much larger than then should do.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator_base() = default;
private:
EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
@@ -143,7 +142,7 @@
template <typename Scalar, int OuterStride>
class plainobjectbase_evaluator_data {
public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
: data(ptr) {
#ifndef EIGEN_INTERNAL_DEBUGGING
EIGEN_UNUSED_VARIABLE(outerStride);
@@ -157,9 +156,9 @@
template <typename Scalar>
class plainobjectbase_evaluator_data<Scalar, Dynamic> {
public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
: data(ptr), m_outerStride(outerStride) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return m_outerStride; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const { return m_outerStride; }
const Scalar* data;
protected:
@@ -189,32 +188,34 @@
: RowsAtCompileTime
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() : m_d(0, OuterStrideAtCompileTime) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) {
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const PlainObjectType& m)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const PlainObjectType& m)
: m_d(m.data(), IsVectorAtCompileTime ? 0 : m.outerStride()) {
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
if (IsRowMajor)
return m_d.data[row * m_d.outerStride() + col];
else
return m_d.data[row + col * m_d.outerStride()];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
if (IsRowMajor)
return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
else
return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return const_cast<Scalar*>(m_d.data)[index]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
+ return const_cast<Scalar*>(m_d.data)[index];
+ }
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
@@ -251,9 +252,10 @@
: evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+ : evaluator<PlainObjectBase<XprType>>(m) {}
};
template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -261,9 +263,10 @@
: evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+ : evaluator<PlainObjectBase<XprType>>(m) {}
};
// -------------------- Transpose --------------------
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 30e0aa3..97f9b50 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -298,7 +298,7 @@
*
* \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
return internal::evaluator<Derived>(derived()).coeffRef(row, col);
}
@@ -312,7 +312,7 @@
* \sa operator[](Index)
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index row, Index col) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index row, Index col) {
eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
return coeffRef(row, col);
}
@@ -332,7 +332,7 @@
* \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
eigen_internal_assert(index >= 0 && index < size());
@@ -346,7 +346,7 @@
* \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator[](Index index) {
EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
eigen_assert(index >= 0 && index < size());
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index a2d6ee2..6d16700 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -46,9 +46,9 @@
typedef typename internal::traits<Derived>::StorageKind StorageKind;
/** \returns a reference to the derived object */
- EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast<Derived*>(this); }
+ EIGEN_DEVICE_FUNC constexpr Derived& derived() { return *static_cast<Derived*>(this); }
/** \returns a const reference to the derived object */
- EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast<const Derived*>(this); }
+ EIGEN_DEVICE_FUNC constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
EIGEN_DEVICE_FUNC inline Derived& const_cast_derived() const {
return *static_cast<Derived*>(const_cast<EigenBase*>(this));
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 1980e92..7e4f054 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -1934,6 +1934,22 @@
}
template <>
+EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+ // Clamp exponent to [-1024, 1024]
+ const Packet4d min_exponent = pset1<Packet4d>(-1023.0);
+ const Packet4d max_exponent = pset1<Packet4d>(1024.0);
+ const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent));
+ const Packet4i bias = pset1<Packet4i>(1023);
+
+ // 2^e
+ Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3);
+ const Packet4i lo = _mm_slli_epi64(hi, 52);
+ hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+ const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+ return pmul(a, c); // a * 2^e
+}
+
+template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
}
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 78d17d5..5d869e4 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -155,6 +155,7 @@
HasExp = 1,
HasATan = 1,
HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasATanh = 1,
HasCmp = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index da26cd4..49220ca 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -3183,6 +3183,7 @@
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasATanh = 1,
HasATan = 0,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 4e441b4..e21d3ef 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -274,22 +274,20 @@
//
// Assumes IEEE floating point format
template <typename Packet>
-struct pldexp_fast_impl {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
typedef typename unpacket_traits<Packet>::type Scalar;
typedef typename unpacket_traits<PacketI>::type ScalarI;
static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
ExponentBits = TotalBits - MantissaBits - 1;
- static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) {
- const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127
- const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255
- // restrict biased exponent between 0 and 255 for float.
- const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
- // return a * (2^e)
- return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
- }
-};
+ const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127
+ const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255
+ // restrict biased exponent between 0 and 255 for float.
+ const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
+ // return a * (2^e)
+ return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
+}
// Natural or base 2 logarithm.
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
@@ -514,6 +512,7 @@
const Packet cst_half = pset1<Packet>(0.5f);
const Packet cst_exp_hi = pset1<Packet>(88.723f);
const Packet cst_exp_lo = pset1<Packet>(-104.f);
+ const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
@@ -549,7 +548,12 @@
y = pmadd(r2, y, p_low);
// Return 2^m * exp(r).
- // TODO: replace pldexp with faster implementation since y in [-1, 1).
+ const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
+ if (!predux_any(fast_pldexp_unsafe)) {
+ // For |x| <= 87, we know the result is not zero or inf, and we can safely use
+ // the fast version of pldexp.
+ return pmax(pldexp_fast(y, m), _x);
+ }
return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
}
@@ -562,8 +566,8 @@
const Packet cst_half = pset1<Packet>(0.5);
const Packet cst_exp_hi = pset1<Packet>(709.784);
- const Packet cst_exp_lo = pset1<Packet>(-709.784);
-
+ const Packet cst_exp_lo = pset1<Packet>(-745.519);
+ const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
@@ -616,7 +620,12 @@
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
- // TODO: replace pldexp with faster implementation since x in [-1, 1).
+ const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
+ if (!predux_any(fast_pldexp_unsafe)) {
+ // For |x| <= 708, we know the result is not zero or inf, and we can safely use
+ // the fast version of pldexp.
+ return pmax(pldexp_fast(x, fx), _x);
+ }
return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
}
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 3b362f4..ac0e2cf 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -42,6 +42,18 @@
template <typename Packet>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent);
+// Explicitly multiplies
+// a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent);
+
/** \internal \returns log(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 2f401fd..3f2d9d5 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -5141,7 +5141,7 @@
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasErf = 0,
+ HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH
};
};
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index c749763..f294009 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -1766,7 +1766,6 @@
// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
// supported by SSE, and has more range than is needed for exponents.
-// TODO(rmlarsen): Remove this specialization once Packet2l has support or casting.
template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
// Clamp exponent to [-2099, 2099]
@@ -1787,6 +1786,24 @@
return out;
}
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+ // Clamp exponent to [-1023, 1024]
+ const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
+ const Packet2d max_exponent = pset1<Packet2d>(1024.0);
+ const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
+
+ // Convert e to integer and swizzle to low-order bits.
+ const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+
+ // Compute 2^e multiply:
+ const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+ const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52)); // 2^e
+ return pmul(a, c);
+}
+
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template <>
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index c53bb90..a478b80 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -131,8 +131,7 @@
struct functor_traits<linspaced_op<Scalar> > {
enum {
Cost = 1,
- PacketAccess =
- (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
+ PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
/*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is
// currently disabled
IsRepeatable = true
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index defd3c2..03542e3 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -1292,7 +1292,7 @@
p = pmadd(r2, p, p_low);
// 4. Undo subtractive range reduction exp(m*ln(2) + r) = 2^m * exp(r).
- Packet e = pldexp_fast_impl<Packet>::run(p, m);
+ Packet e = pldexp_fast(p, m);
// 5. Undo multiplicative range reduction by using exp(r) = exp(r/2)^2.
e = pmul(e, e);
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 2acdd9d..a278c91 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -391,7 +391,7 @@
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) {
- constexpr std::size_t max_elements = PTRDIFF_MAX / sizeof(T);
+ constexpr std::size_t max_elements = (std::numeric_limits<std::ptrdiff_t>::max)() / sizeof(T);
if (size > max_elements) throw_std_bad_alloc();
}
diff --git a/test/constexpr.cpp b/test/constexpr.cpp
index 9fdf447..34c728f 100644
--- a/test/constexpr.cpp
+++ b/test/constexpr.cpp
@@ -16,38 +16,48 @@
// until after the constructor returns:
// error: member ‘Eigen::internal::plain_array<int, 9, 0, 0>::array’ must be initialized by mem-initializer in
// ‘constexpr’ constructor
-#if EIGEN_COMP_CXXVER >= 20
+#if __cpp_constexpr >= 201907L
constexpr Matrix3i mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
VERIFY_IS_EQUAL(mat.size(), 9);
- VERIFY_IS_EQUAL(mat(0, 0), 1);
+ static_assert(mat(0, 0) == 1);
+ static_assert(mat(0) == 1);
static_assert(mat.coeff(0, 1) == 2);
constexpr Array33i arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
- VERIFY_IS_EQUAL(arr(0, 0), 1);
+ static_assert(arr(0, 0) == 1);
+ static_assert(arr(0) == 1);
VERIFY_IS_EQUAL(arr.size(), 9);
static_assert(arr.coeff(0, 1) == 2);
+ constexpr RowVector3i vec{{1, 2, 3}};
+ static_assert(vec(0, 0) == 1);
+ static_assert(vec[0] == 1);
+ VERIFY_IS_EQUAL(vec.size(), 3);
+ static_assert(vec.coeff(0, 1) == 2);
+
// Also check dynamic size arrays/matrices with fixed-size storage (currently
// only works if all elements are initialized, since otherwise the compiler
// complains about uninitialized trailing elements.
constexpr Matrix<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
VERIFY_IS_EQUAL(dyn_mat.size(), 9);
- VERIFY_IS_EQUAL(dyn_mat(0, 0), 1);
+ static_assert(dyn_mat(0, 0) == 1);
static_assert(dyn_mat.coeff(0, 1) == 2);
constexpr Array<int, Eigen::Dynamic, Eigen::Dynamic, 0, 3, 3> dyn_arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
- VERIFY_IS_EQUAL(dyn_arr(0, 0), 1);
+ static_assert(dyn_arr(0, 0) == 1);
+ static_assert(dyn_arr(0) == 1);
VERIFY_IS_EQUAL(dyn_arr.size(), 9);
static_assert(dyn_arr.coeff(0, 1) == 2);
-#endif // EIGEN_COMP_CXXVER >= 20
+#endif // __cpp_constexpr >= 201907L
}
// Check that we can use the std::initializer_list constructor for constexpr variables.
-#if EIGEN_COMP_CXXVER >= 20
+#if __cpp_constexpr >= 201907L
// EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT() will fail constexpr evaluation unless
// we have std::is_constant_evaluated().
constexpr Matrix<int, 2, 2> global_mat({{1, 2}, {3, 4}});
EIGEN_DECLARE_TEST(constexpr_global) {
VERIFY_IS_EQUAL(global_mat.size(), 4);
- VERIFY_IS_EQUAL(global_mat(0, 0), 1);
+ static_assert(global_mat(0, 0) == 1);
+ static_assert(global_mat(0) == 1);
static_assert(global_mat.coeff(0, 0) == 1);
}
-#endif // EIGEN_COMP_CXXVER >= 20
+#endif // __cpp_constexpr >= 201907L
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
index 0f6ab64..d394a94 100644
--- a/test/dense_storage.cpp
+++ b/test/dense_storage.cpp
@@ -38,6 +38,13 @@
// all fixed-size, fixed-dimension plain object types are trivially move constructible
static_assert(std::is_trivially_move_constructible<Matrix4f>::value, "Matrix4f not trivially_move_constructible");
static_assert(std::is_trivially_move_constructible<Array4f>::value, "Array4f not trivially_move_constructible");
+// all statically-allocated plain object types are trivially destructible
+static_assert(std::is_trivially_destructible<Matrix4f>::value, "Matrix4f not trivially_destructible");
+static_assert(std::is_trivially_destructible<Array4f>::value, "Array4f not trivially_destructible");
+static_assert(std::is_trivially_destructible<Matrix<float, 4, Dynamic, 0, 4, 4>>::value,
+ "Matrix4X44 not trivially_destructible");
+static_assert(std::is_trivially_destructible<Array<float, 4, Dynamic, 0, 4, 4>>::value,
+ "Array4X44 not trivially_destructible");
#if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
// all fixed-size, fixed-dimension plain object types are trivially copy constructible
static_assert(std::is_trivially_copy_constructible<Matrix4f>::value, "Matrix4f not trivially_copy_constructible");
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 0b266f9..5f95fd0 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -269,81 +269,8 @@
}
};
-/****************************************************************************
- * Implementation of erf, requires C++11/C99 *
- ****************************************************************************/
-
-/** \internal \returns the error function of \a a (coeff-wise)
- This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for
- normalized floats.
-
- This implementation works on both scalars and SIMD "packets".
-*/
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
- // The monomial coefficients of the numerator polynomial (odd).
- constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
- 3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
- 1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
-
- // The monomial coefficients of the denominator polynomial (even).
- constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
- 1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
- 4.99425798654556274414062500000e-01f, 1.0f};
-
- // Since the polynomials are odd/even, we need x^2.
- // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
- // computing Inf/Inf below.
- const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
-
- // Evaluate the numerator polynomial p.
- T p = ppolevl<T, 5>::run(x2, alpha);
- p = pmul(x, p);
-
- // Evaluate the denominator polynomial p.
- T q = ppolevl<T, 5>::run(x2, beta);
- const T r = pdiv(p, q);
-
- // Clamp to [-1:1].
- return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f));
-}
-
-template <typename T>
-struct erf_impl {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf_float(x); }
-};
-
-template <typename Scalar>
-struct erf_retval {
- typedef Scalar type;
-};
-
-#if EIGEN_HAS_C99_MATH
-template <>
-struct erf_impl<float> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) {
-#if defined(SYCL_DEVICE_ONLY)
- return cl::sycl::erf(x);
-#else
- return generic_fast_erf_float(x);
-#endif
- }
-};
-
-template <>
-struct erf_impl<double> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) {
-#if defined(SYCL_DEVICE_ONLY)
- return cl::sycl::erf(x);
-#else
- return ::erf(x);
-#endif
- }
-};
-#endif // EIGEN_HAS_C99_MATH
-
/***************************************************************************
- * Implementation of erfc, requires C++11/C99 *
+ * Implementation of erfc.
****************************************************************************/
template <typename Scalar>
struct generic_fast_erfc {
@@ -366,7 +293,7 @@
2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
const T x2 = pmul(x, x);
- const T one = pset1<T>(1.0);
+ const T one = pset1<T>(1.0f);
const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);
// Return early if we don't need the more expensive approximation for any
@@ -401,46 +328,42 @@
return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
}
-template <>
+// Computes erf(x)/x for |x| <= 1. Used by both erf and erfc implementations.
+// Takes x2 = x^2 as input.
+//
+// PRECONDITION: x2 <= 1.
template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
- // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold).
- // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
- constexpr double kClamp = 28.0;
- const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
-
- // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erf_over_x_double_small(const T& x2) {
+ // erf(x)/x = S(x^2) / T(x^2), x^2 <= 1.
//
// Coefficients for S and T generated with Rminimax command:
- // ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10]
+ // ./ratapprox --function="erf(x)" --dom='[-1,1]' --type=[9,10]
// --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
- constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04,
- -1.8272566210022942682217328425053892715368419885635375976562500e-03,
- -4.5303363351690106863856044583371840417385101318359375000000000e-02,
- -1.4215015503619179981775744181504705920815467834472656250000000e-01,
- -1.1283791670955125585606992899556644260883331298828125000000000e+00};
+ constexpr double alpha[] = {1.9493725660006057018823477644531294572516344487667083740234375e-04,
+ 1.8272566210022942682217328425053892715368419885635375976562500e-03,
+ 4.5303363351690106863856044583371840417385101318359375000000000e-02,
+ 1.4215015503619179981775744181504705920815467834472656250000000e-01,
+ 1.1283791670955125585606992899556644260883331298828125000000000e+00};
constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
6.8117805899186819641732970609382391558028757572174072265625000e-04,
1.0582026056098614921752165685120417037978768348693847656250000e-02,
9.3252603143757495374188692949246615171432495117187500000000000e-02,
4.5931062818368939559832142549566924571990966796875000000000000e-01,
1.0};
- const T x2 = pmul(x, x);
const T num_small = ppolevl<T, 4>::run(x2, alpha);
const T denom_small = ppolevl<T, 5>::run(x2, beta);
- const T one = pset1<T>(1.0);
- const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one);
+ return pdiv(num_small, denom_small);
+}
- // Return early if we don't need the more expensive approximation for any
- // entry in a.
- const T x_abs_gt_one_mask = pcmp_lt(one, x2);
- if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
-
- // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27.
- //
- // Coefficients for P and Q generated with Rminimax command:
- // ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
- // --denF="[D]" --log --dispCoeff="dec"
+// erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 28.
+//
+// Coefficients for P and Q generated with Rminimax command:
+// ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
+// --denF="[D]" --log --dispCoeff="dec"
+//
+// PRECONDITION: 1 < x < 28.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erfc_double_large(const T& x, const T& x2) {
constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
1.0909912393738931124520519233556115068495273590087890625000000e-02,
1.0628604636755033252537572252549580298364162445068359375000000e-01,
@@ -461,7 +384,7 @@
3.152505418656005586885981983868987299501895904541015625000000e-02,
2.565085751861882583380047861965067568235099315643310546875000e-03,
7.899362131678837697403017248376499992446042597293853759765625e-05};
-
+ // Compute exp(-x^2).
const T x2_lo = twoprod_low(x, x, x2);
// Here we use that
// exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
@@ -469,12 +392,34 @@
// from 258 ulps to below 7 ulps.
const T exp2_hi = pexp(pnegate(x2));
const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+ // Compute r = P / Q.
const T q2 = preciprocal(x2);
const T num_large = ppolevl<T, 9>::run(q2, gamma);
const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
const T r = pdiv(num_large, denom_large);
const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
- const T erfc_large = pmadd(z, r, maybe_two);
+ return pmadd(z, r, maybe_two);
+}
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
+ // Clamp x to [-28:28] beyond which erfc(x) is either two or zero (below the underflow threshold).
+ // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
+ constexpr double kClamp = 28.0;
+ const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
+
+ // For |x| < 1, we use erfc(x) = 1 - erf(x).
+ const T x2 = pmul(x, x);
+ const T one = pset1<T>(1.0);
+ const T erfc_small = pnmadd(x, erf_over_x_double_small(x2), one);
+
+ // Return early if we don't need the more expensive approximation for any
+ // entry in a.
+ const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+ if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+ const T erfc_large = erfc_double_large(x, x2);
return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
}
@@ -513,6 +458,104 @@
};
#endif // EIGEN_HAS_C99_MATH
+/****************************************************************************
+ * Implementation of erf.
+ ****************************************************************************/
+
+template <typename Scalar>
+struct generic_fast_erf {
+ template <typename T>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+/** \internal \returns the error function of \a a (coeff-wise)
+ This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for
+ normalized floats.
+
+ This implementation works on both scalars and SIMD "packets".
+*/
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<float>::run(const T& x) {
+ // The monomial coefficients of the numerator polynomial (odd).
+ constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
+ 3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
+ 1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
+
+ // The monomial coefficients of the denominator polynomial (even).
+ constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
+ 1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
+ 4.99425798654556274414062500000e-01f, 1.0f};
+
+ // Since the polynomials are odd/even, we need x^2.
+ // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
+ // computing Inf/Inf below.
+ const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
+
+ // Evaluate the numerator polynomial p.
+ T p = ppolevl<T, 5>::run(x2, alpha);
+ p = pmul(x, p);
+
+ // Evaluate the denominator polynomial p.
+ T q = ppolevl<T, 5>::run(x2, beta);
+ const T r = pdiv(p, q);
+
+ // Clamp to [-1:1].
+ return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f));
+}
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<double>::run(const T& x) {
+ T x2 = pmul(x, x);
+ T erf_small = pmul(x, erf_over_x_double_small(x2));
+
+ // Return early if we don't need the more expensive approximation for any
+ // entry in a.
+ const T one = pset1<T>(1.0);
+ const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+ if (!predux_any(x_abs_gt_one_mask)) return erf_small;
+
+ // For |x| > 1, use erf(x) = 1 - erfc(x).
+ const T erf_large = psub(one, erfc_double_large(x, x2));
+ return pselect(x_abs_gt_one_mask, erf_large, erf_small);
+}
+
+template <typename T>
+struct erf_impl {
+ typedef typename unpacket_traits<T>::type Scalar;
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf<Scalar>::run(x); }
+};
+
+template <typename Scalar>
+struct erf_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::erf(x);
+#else
+ return generic_fast_erf<float>::run(x);
+#endif
+ }
+};
+
+template <>
+struct erf_impl<double> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::erf(x);
+#else
+ return generic_fast_erf<double>::run(x);
+#endif
+ }
+};
+#endif // EIGEN_HAS_C99_MATH
+
/***************************************************************************
* Implementation of ndtri. *
****************************************************************************/