Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/5dc2fbabeee17fe023c38756ebde0c1d56472913
BEGIN_PUBLIC
Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/5dc2fbabeee17fe023c38756ebde0c1d56472913
END_PUBLIC
PiperOrigin-RevId: 347493578
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 67e97ff..492cd5a 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -53,7 +53,7 @@
* decomposition to determine whether a system of equations has a solution.
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
- *
+ *
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
*/
template<typename _MatrixType, int _UpLo> class LDLT
@@ -246,8 +246,8 @@
*/
const LDLT& adjoint() const { return *this; };
- inline Index rows() const { return m_matrix.rows(); }
- inline Index cols() const { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
/** \brief Reports whether previous computation was successful.
*
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 229e258..47ccf62 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -17,7 +17,7 @@
// This implementation is based on Assign.h
namespace internal {
-
+
/***************************************************************************
* Part 1 : the logic deciding a strategy for traversal and unrolling *
***************************************************************************/
@@ -29,12 +29,12 @@
{
typedef typename DstEvaluator::XprType Dst;
typedef typename Dst::Scalar DstScalar;
-
+
enum {
DstFlags = DstEvaluator::Flags,
SrcFlags = SrcEvaluator::Flags
};
-
+
public:
enum {
DstAlignment = DstEvaluator::Alignment,
@@ -99,7 +99,8 @@
public:
enum {
- Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
+ Traversal = int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time.
+ : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
@@ -137,7 +138,7 @@
? int(CompleteUnrolling)
: int(NoUnrolling) )
: int(Traversal) == int(LinearTraversal)
- ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
+ ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
: int(NoUnrolling) )
#if EIGEN_UNALIGNED_VECTORIZE
: int(Traversal) == int(SliceVectorizedTraversal)
@@ -199,7 +200,7 @@
// FIXME: this is not very clean, perhaps this information should be provided by the kernel?
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
-
+
enum {
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime
@@ -265,7 +266,7 @@
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::PacketType PacketType;
-
+
enum {
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime,
@@ -317,6 +318,22 @@
struct dense_assignment_loop;
/************************
+***** Special Cases *****
+************************/
+
+// Zero-sized assignment is a no-op.
+template<typename Kernel, int Unrolling>
+struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling>
+{
+ EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/)
+ {
+ typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+ EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0,
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
+ }
+};
+
+/************************
*** Default traversal ***
************************/
@@ -430,7 +447,7 @@
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::PacketType PacketType;
-
+
enum { size = DstXprType::SizeAtCompileTime,
packetSize =unpacket_traits<PacketType>::size,
alignedSize = (size/packetSize)*packetSize };
@@ -572,14 +589,15 @@
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::PacketType PacketType;
- enum { size = DstXprType::InnerSizeAtCompileTime,
+ enum { innerSize = DstXprType::InnerSizeAtCompileTime,
packetSize =unpacket_traits<PacketType>::size,
- vectorizableSize = (size/packetSize)*packetSize };
+ vectorizableSize = (innerSize/packetSize)*packetSize,
+ size = DstXprType::SizeAtCompileTime };
for(Index outer = 0; outer < kernel.outerSize(); ++outer)
{
copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
- copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
+ copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer);
}
}
};
@@ -603,14 +621,14 @@
typedef typename DstEvaluatorTypeT::XprType DstXprType;
typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
public:
-
+
typedef DstEvaluatorTypeT DstEvaluatorType;
typedef SrcEvaluatorTypeT SrcEvaluatorType;
typedef typename DstEvaluatorType::Scalar Scalar;
typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
typedef typename AssignmentTraits::PacketType PacketType;
-
-
+
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
@@ -619,58 +637,58 @@
AssignmentTraits::debug();
#endif
}
-
+
EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); }
EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); }
EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); }
EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); }
EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
-
+
EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
-
+
/// Assign src(row,col) to dst(row,col) through the assignment functor.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
{
m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
}
-
+
/// \sa assignCoeff(Index,Index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
{
m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
}
-
+
/// \sa assignCoeff(Index,Index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
{
- Index row = rowIndexByOuterInner(outer, inner);
- Index col = colIndexByOuterInner(outer, inner);
+ Index row = rowIndexByOuterInner(outer, inner);
+ Index col = colIndexByOuterInner(outer, inner);
assignCoeff(row, col);
}
-
-
+
+
template<int StoreMode, int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
{
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
}
-
+
template<int StoreMode, int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
{
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
}
-
+
template<int StoreMode, int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
{
- Index row = rowIndexByOuterInner(outer, inner);
+ Index row = rowIndexByOuterInner(outer, inner);
Index col = colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode,PacketType>(row, col);
}
-
+
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
{
typedef typename DstEvaluatorType::ExpressionTraits Traits;
@@ -693,7 +711,7 @@
{
return m_dstExpr.data();
}
-
+
protected:
DstEvaluatorType& m_dst;
const SrcEvaluatorType& m_src;
@@ -716,13 +734,13 @@
typedef typename Base::DstXprType DstXprType;
typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
typedef typename AssignmentTraits::PacketType PacketType;
-
+
EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{
}
};
-
+
/***************************************************************************
* Part 5 : Entry point for dense rectangular assignment
***************************************************************************/
@@ -760,7 +778,7 @@
resize_if_allowed(dst, src, func);
DstEvaluatorType dstEvaluator(dst);
-
+
typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
@@ -788,7 +806,7 @@
template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
-
+
// This is the main assignment class
template< typename DstXprType, typename SrcXprType, typename Functor,
typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
@@ -813,7 +831,7 @@
{
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
}
-
+
// Deal with "assume-aliasing"
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -853,12 +871,12 @@
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
ActualDstType actualDst(dst);
-
+
// TODO check whether this is the right place to perform these checks:
EIGEN_STATIC_ASSERT_LVALUE(Dst)
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
-
+
Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
}
@@ -875,7 +893,7 @@
SrcEvaluatorType srcEvaluator(src);
resize_if_allowed(dst, src, func);
-
+
DstEvaluatorType dstEvaluator(dst);
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
@@ -922,7 +940,7 @@
#ifndef EIGEN_NO_DEBUG
internal::check_for_aliasing(dst, src);
#endif
-
+
call_dense_assignment_loop(dst, src, func);
}
};
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index e2fc700..671ed3c 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -539,6 +539,20 @@
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; }
+/** \internal \returns a packet whose even-indexed coefficients have all bits set and whose odd-indexed
+ coefficients are zero, e.g.: (x, 0, x, 0), where x is the value of all 1-bits. The value of \a a is ignored. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+peven_mask(const Packet& /*a*/) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ const size_t n = unpacket_traits<Packet>::size;
+ Scalar elements[n];
+ for(size_t i = 0; i < n; ++i) {
+ memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
+ }
+ return ploadu<Packet>(elements);
+}
+
+
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
{ (*to) = from; }
@@ -568,7 +582,7 @@
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// do nothing
#elif defined(EIGEN_CUDA_ARCH)
-#if defined(__LP64__)
+#if defined(__LP64__) || EIGEN_OS_WIN64
// 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
#else
@@ -650,6 +664,13 @@
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); }
+/** \internal \returns the log2 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog2(const Packet& a) {
+ typedef typename internal::unpacket_traits<Packet>::type Scalar;
+ return pmul(pset1<Packet>(Scalar(EIGEN_LOG2E)), plog(a));
+}
+
/** \internal \returns the square-root of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet psqrt(const Packet& a) { EIGEN_USING_STD(sqrt); return sqrt(a); }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 0a539ed..629af94 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -81,7 +81,8 @@
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
- EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index e9da359..3cf91bd 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -10,9 +10,11 @@
#ifndef EIGEN_MATHFUNCTIONS_H
#define EIGEN_MATHFUNCTIONS_H
-// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
// TODO this should better be moved to NumTraits
-#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
+// Source: WolframAlpha
+#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
+#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
+#define EIGEN_LN2 0.693147180559945309417232121458176568075500134360255254120680009493393621L
namespace Eigen {
@@ -1374,6 +1376,11 @@
return sqrt(x);
}
+// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
+template<>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+bool sqrt<bool>(const bool &x) { return x; }
+
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
#endif
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 6e220d7..02b5843 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -14,7 +14,7 @@
#define EIGEN_PRODUCTEVALUATORS_H
namespace Eigen {
-
+
namespace internal {
/** \internal
@@ -22,19 +22,19 @@
* Since products require special treatments to handle all possible cases,
* we simply defer the evaluation logic to a product_evaluator class
* which offers more partial specialization possibilities.
- *
+ *
* \sa class product_evaluator
*/
template<typename Lhs, typename Rhs, int Options>
-struct evaluator<Product<Lhs, Rhs, Options> >
+struct evaluator<Product<Lhs, Rhs, Options> >
: public product_evaluator<Product<Lhs, Rhs, Options> >
{
typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base;
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
};
-
+
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
// TODO we should apply that rule only if that's really helpful
template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
@@ -62,12 +62,12 @@
template<typename Lhs, typename Rhs, int DiagIndex>
-struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
: public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
{
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
@@ -108,23 +108,23 @@
: m_result(xpr.rows(), xpr.cols())
{
::new (static_cast<Base*>(this)) Base(m_result);
-
+
// FIXME shall we handle nested_eval here?,
// if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
// typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
// typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
// typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
// typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-//
+//
// const LhsNested lhs(xpr.lhs());
// const RhsNested rhs(xpr.rhs());
-//
+//
// generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
}
-
-protected:
+
+protected:
PlainObject m_result;
};
@@ -250,7 +250,7 @@
{
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
@@ -298,7 +298,7 @@
{
template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
@@ -310,31 +310,31 @@
dst.const_cast_derived() += m_scale * src;
}
};
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
}
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
}
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
}
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
}
-
+
};
@@ -343,7 +343,7 @@
struct generic_product_impl_base
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
@@ -355,7 +355,7 @@
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
@@ -379,7 +379,7 @@
if (lhs.rows() == 1 && rhs.cols() == 1) {
dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
return;
- }
+ }
LhsNested actual_lhs(lhs);
RhsNested actual_rhs(rhs);
internal::gemv_dense_selector<Side,
@@ -390,10 +390,10 @@
};
template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
@@ -408,7 +408,7 @@
// dst.noalias() += lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
}
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
@@ -525,7 +525,7 @@
typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-
+
typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
@@ -544,7 +544,7 @@
typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
enum {
-
+
LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
@@ -553,10 +553,10 @@
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-
+
LhsFlags = LhsEtorType::Flags,
RhsFlags = RhsEtorType::Flags,
-
+
LhsRowMajor = LhsFlags & RowMajorBit,
RhsRowMajor = RhsFlags & RowMajorBit,
@@ -566,7 +566,7 @@
// Here, we don't care about alignment larger than the usable packet size.
LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
-
+
SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
@@ -581,7 +581,7 @@
// TODO enable vectorization for mixed types
| (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
| (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
-
+
LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
@@ -600,7 +600,7 @@
&& (LhsFlags & RhsFlags & ActualPacketAccessBit)
&& (InnerSize % packet_traits<Scalar>::size == 0)
};
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
{
return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
@@ -642,7 +642,7 @@
protected:
typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-
+
LhsEtorType m_lhsImpl;
RhsEtorType m_rhsImpl;
@@ -673,7 +673,7 @@
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
@@ -683,7 +683,7 @@
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
@@ -693,7 +693,7 @@
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
}
@@ -702,7 +702,7 @@
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
}
@@ -711,7 +711,7 @@
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
@@ -720,7 +720,7 @@
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
@@ -729,7 +729,7 @@
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for(Index i = 0; i < innerDim; ++i)
@@ -740,7 +740,7 @@
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
- static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for(Index i = 0; i < innerDim; ++i)
@@ -762,7 +762,7 @@
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
@@ -776,7 +776,7 @@
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
@@ -797,7 +797,7 @@
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
template<typename Dest>
static EIGEN_DEVICE_FUNC
void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@@ -811,7 +811,7 @@
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-
+
template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
@@ -823,7 +823,7 @@
/***************************************************************************
* Diagonal products
***************************************************************************/
-
+
template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
struct diagonal_product_evaluator_base
: evaluator_base<Derived>
@@ -832,10 +832,10 @@
public:
enum {
CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
-
+
MatrixFlags = evaluator<MatrixType>::Flags,
DiagFlags = evaluator<DiagonalType>::Flags,
-
+
_StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
: (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
: MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
@@ -858,14 +858,14 @@
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};
-
+
EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
: m_diagImpl(diag), m_matImpl(mat)
{
EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
if(AsScalarProduct)
@@ -873,7 +873,7 @@
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}
-
+
protected:
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
@@ -881,7 +881,7 @@
return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
internal::pset1<PacketType>(m_diagImpl.coeff(id)));
}
-
+
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
{
@@ -892,7 +892,7 @@
return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
}
-
+
evaluator<DiagonalType> m_diagImpl;
evaluator<MatrixType> m_matImpl;
};
@@ -907,24 +907,24 @@
using Base::m_matImpl;
using Base::coeff;
typedef typename Base::Scalar Scalar;
-
+
typedef Product<Lhs, Rhs, ProductKind> XprType;
typedef typename XprType::PlainObject PlainObject;
typedef typename Lhs::DiagonalVectorType DiagonalType;
-
+
enum { StorageOrder = Base::_StorageOrder };
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
: Base(xpr.rhs(), xpr.lhs().diagonal())
{
}
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
{
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
}
-
+
#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
@@ -934,7 +934,7 @@
return this->template packet_impl<LoadMode,PacketType>(row,col, row,
typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
}
-
+
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index idx) const
{
@@ -953,22 +953,22 @@
using Base::m_matImpl;
using Base::coeff;
typedef typename Base::Scalar Scalar;
-
+
typedef Product<Lhs, Rhs, ProductKind> XprType;
typedef typename XprType::PlainObject PlainObject;
-
+
enum { StorageOrder = Base::_StorageOrder };
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
: Base(xpr.lhs(), xpr.rhs().diagonal())
{
}
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
{
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
}
-
+
#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
@@ -976,7 +976,7 @@
return this->template packet_impl<LoadMode,PacketType>(row,col, col,
typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
}
-
+
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index idx) const
{
@@ -1004,7 +1004,7 @@
typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
template<typename Dest, typename PermutationType>
- static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
{
MatrixType mat(xpr);
const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
@@ -1058,7 +1058,7 @@
struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
{
permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
}
@@ -1068,7 +1068,7 @@
struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
{
permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
}
@@ -1078,7 +1078,7 @@
struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
{
permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
}
@@ -1088,7 +1088,7 @@
struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
{
permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
}
@@ -1110,9 +1110,9 @@
{
typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-
+
template<typename Dest, typename TranspositionType>
- static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
{
MatrixType mat(xpr);
typedef typename TranspositionType::StorageIndex StorageIndex;
@@ -1135,7 +1135,7 @@
struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
{
transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
}
@@ -1145,7 +1145,7 @@
struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
{
transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
}
@@ -1156,7 +1156,7 @@
struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
{
transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
}
@@ -1166,7 +1166,7 @@
struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
{
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
{
transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
}
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index a78fd88..36b4f41 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -12,7 +12,6 @@
#define EIGEN_RESHAPED_H
namespace Eigen {
-namespace internal {
/** \class Reshaped
* \ingroup Core_Module
@@ -44,6 +43,8 @@
* \sa DenseBase::reshaped(NRowsType,NColsType)
*/
+namespace internal {
+
template<typename XprType, int Rows, int Cols, int Order>
struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
{
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 813fef0d..dc53b5e 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_SOLVETRIANGULAR_H
#define EIGEN_SOLVETRIANGULAR_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
@@ -54,7 +54,7 @@
typedef blas_traits<Lhs> LhsProductTraits;
typedef typename LhsProductTraits::ExtractType ActualLhsType;
typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
- static void run(const Lhs& lhs, Rhs& rhs)
+ static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
{
ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
@@ -64,7 +64,7 @@
ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
(useRhsDirectly ? rhs.data() : 0));
-
+
if(!useRhsDirectly)
MappedRhs(actualRhs,rhs.size()) = rhs;
@@ -85,7 +85,7 @@
typedef blas_traits<Lhs> LhsProductTraits;
typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
- static void run(const Lhs& lhs, Rhs& rhs)
+ static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
{
typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
@@ -118,7 +118,7 @@
DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1,
StartIndex = IsLower ? 0 : DiagIndex+1
};
- static void run(const Lhs& lhs, Rhs& rhs)
+ static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
{
if (LoopIndex>0)
rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
@@ -133,22 +133,22 @@
template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
- static void run(const Lhs&, Rhs&) {}
+ static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {}
};
template<typename Lhs, typename Rhs, int Mode>
struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
- static void run(const Lhs& lhs, Rhs& rhs)
+ static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
{ triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
};
template<typename Lhs, typename Rhs, int Mode>
struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
- static void run(const Lhs& lhs, Rhs& rhs)
+ static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
{
Transpose<const Lhs> trLhs(lhs);
Transpose<Rhs> trRhs(rhs);
-
+
triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index fcfe6f4..f323a2b 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -47,6 +47,7 @@
Index cols() const { return indices().size(); }
/** Direct access to the underlying index vector */
+ EIGEN_DEVICE_FUNC
inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
/** Direct access to the underlying index vector */
inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
@@ -373,6 +374,7 @@
return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
}
+ EIGEN_DEVICE_FUNC
const TranspositionType& nestedExpression() const { return m_transpositions; }
protected:
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 23568ca..506ca0b 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -38,6 +38,7 @@
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
+ HasSqrt = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -47,7 +48,18 @@
};
#endif
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> {
+ typedef std::complex<float> type;
+ typedef Packet2cf half;
+ typedef Packet8f as_real;
+ enum {
+ size=4,
+ alignment=Aligned32,
+ vectorizable=true,
+ masked_load_available=false,
+ masked_store_available=false
+ };
+};
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -228,6 +240,7 @@
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
+ HasSqrt = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -237,7 +250,18 @@
};
#endif
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> {
+ typedef std::complex<double> type;
+ typedef Packet1cd half;
+ typedef Packet4d as_real;
+ enum {
+ size=2,
+ alignment=Aligned32,
+ vectorizable=true,
+ masked_load_available=false,
+ masked_store_available=false
+ };
+};
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -399,6 +423,14 @@
kernel.packet[0].v = tmp;
}
+template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+ return psqrt_complex<Packet2cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+ return psqrt_complex<Packet4cf>(a);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 9b123db..5fe2cff 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -42,6 +42,18 @@
return plog_double(_x);
}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+plog2<Packet8f>(const Packet8f& _x) {
+ return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog2<Packet4d>(const Packet4d& _x) {
+ return plog2_double(_x);
+}
+
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f plog1p<Packet8f>(const Packet8f& _x) {
return generic_plog1p(_x);
@@ -158,9 +170,21 @@
return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
}
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
+
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index ae111c6..3974935 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -105,7 +105,8 @@
HasBlend = 1,
HasRound = 1,
HasFloor = 1,
- HasCeil = 1
+ HasCeil = 1,
+ HasRint = 1
};
};
@@ -119,22 +120,36 @@
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 0,
+
+ HasCmp = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
HasNegate = 1,
- HasAbs = 0,
+ HasAbs = 1,
HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasConj = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 0,
- HasSqrt = 0,
- HasRsqrt = 0,
- HasExp = 0,
- HasLog = 0,
- HasBlend = 0
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExpm1 = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1,
+ HasBessel = 1,
+ HasNdtri = 1,
};
};
@@ -150,16 +165,24 @@
size = 8,
HasHalfPacket = 0,
- HasCmp = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
+ HasSetLinear = 0,
HasLog = 1,
HasLog1p = 1,
HasExpm1 = 1,
HasExp = 1,
- HasNdtri = 1,
- HasBessel = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
@@ -168,7 +191,9 @@
HasRound = 1,
HasFloor = 1,
HasCeil = 1,
- HasRint = 1
+ HasRint = 1,
+ HasBessel = 1,
+ HasNdtri = 1,
};
};
#endif
@@ -211,6 +236,7 @@
_mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
}
+
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
@@ -222,6 +248,11 @@
template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
+
+template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }
+
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
@@ -258,7 +289,15 @@
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
-
+template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_mullo_epi32(a,b);
+#else
+ const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+ const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
@@ -479,14 +518,14 @@
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
{
- const Packet8f mask = pset1frombits<Packet8f>(0x80000000u);
- const Packet8f prev0dot5 = pset1frombits<Packet8f>(0x3EFFFFFFu);
+ const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
+ const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
{
- const Packet4d mask = _mm256_castsi256_pd(_mm256_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull));
- const Packet4d prev0dot5 = _mm256_castsi256_pd(_mm256_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
+ const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+ const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
@@ -695,13 +734,11 @@
__m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
#ifdef EIGEN_VECTORIZE_AVX2
a_expo = _mm256_srli_epi64(a_expo, 52);
-#endif
-#if defined(EIGEN_VECTORIZE_AVX2) && defined(EIGEN_VECTORIZE_AVX512DQ)
- exponent = _mm256_cvtepi64_pd(a_expo);
+ __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+ __m128i hi = _mm256_extractf128_si256(a_expo, 1);
#else
__m128i lo = _mm256_extractf128_si256(a_expo, 0);
__m128i hi = _mm256_extractf128_si256(a_expo, 1);
-#ifndef EIGEN_VECTORIZE_AVX2
lo = _mm_srli_epi64(lo, 52);
hi = _mm_srli_epi64(hi, 52);
#endif
@@ -709,7 +746,6 @@
Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
exponent = _mm256_insertf128_pd(exponent, exponent_lo, 0);
exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
-#endif // EIGEN_VECTORIZE_AVX512DQ
exponent = psub(exponent, cst_1022d);
const Packet4d cst_mant_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
return por(pand(a, cst_mant_mask), cst_half);
@@ -870,14 +906,15 @@
}
// Packet math for Eigen::half
+
template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
- return _mm_set1_epi16(from.x);
+ return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from, 0)));
+ return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}
template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
@@ -898,20 +935,30 @@
template<> EIGEN_STRONG_INLINE Packet8h
ploaddup<Packet8h>(const Eigen::half* from) {
- unsigned short a = from[0].x;
- unsigned short b = from[1].x;
- unsigned short c = from[2].x;
- unsigned short d = from[3].x;
+ const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+ const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+ const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+ const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
return _mm_set_epi16(d, d, c, c, b, b, a, a);
}
template<> EIGEN_STRONG_INLINE Packet8h
ploadquad<Packet8h>(const Eigen::half* from) {
- unsigned short a = from[0].x;
- unsigned short b = from[1].x;
+ const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+ const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
+ const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+ return _mm_andnot_si128(sign_mask, a);
+}
+
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
#ifdef EIGEN_HAS_FP16_C
return _mm256_cvtph_ps(a);
@@ -937,21 +984,33 @@
#else
EIGEN_ALIGN32 float aux[8];
pstore(aux, a);
- Eigen::half h0(aux[0]);
- Eigen::half h1(aux[1]);
- Eigen::half h2(aux[2]);
- Eigen::half h3(aux[3]);
- Eigen::half h4(aux[4]);
- Eigen::half h5(aux[5]);
- Eigen::half h6(aux[6]);
- Eigen::half h7(aux[7]);
-
- return _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+ const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[0]));
+ const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[1]));
+ const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[2]));
+ const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[3]));
+ const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[4]));
+ const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[5]));
+ const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[6]));
+ const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[7]));
+ return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
- return _mm_cmpeq_epi32(a, a);
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
+ const Packet8h& b) {
+ return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
+ const Packet8h& b) {
+ return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+ return float2half(plset<Packet8f>(static_cast<float>(a)));
}
template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
@@ -973,19 +1032,42 @@
return _mm_blendv_epi8(b, a, mask);
}
+template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+ return float2half(pround<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+ return float2half(print<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+ return float2half(pceil<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+ return float2half(pfloor<Packet8f>(half2float(a)));
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
- Packet8f af = half2float(a);
- Packet8f bf = half2float(b);
- Packet8f rf = pcmp_eq(af, bf);
- // Pack the 32-bit flags into 16-bits flags.
- return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
- _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
+ return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
+ return Pack16To8(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
+ return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
+ return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
}
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
- Packet8h sign_mask = _mm_set1_epi16(static_cast<unsigned short>(0x8000));
+ Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
return _mm_xor_si128(a, sign_mask);
}
@@ -1019,7 +1101,15 @@
template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
{
- return _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+ const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+ const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+ const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+ const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+ const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+ const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+ const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+ const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+ return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
@@ -1139,6 +1229,8 @@
kernel.packet[3] = pload<Packet8h>(out[3]);
}
+// BFloat16 implementation.
+
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
#ifdef EIGEN_VECTORIZE_AVX2
__m256i extend = _mm256_cvtepu16_epi32(a);
@@ -1178,7 +1270,7 @@
__m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q);
__m256i nan = _mm256_set1_epi32(0x7fc0);
t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
- // output.value = static_cast<uint16_t>(input);
+ // output = numext::bit_cast<uint16_t>(input);
return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
_mm256_extractf128_si256(t, 1));
#else
@@ -1202,17 +1294,17 @@
__m128i nan = _mm_set1_epi32(0x7fc0);
lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
- // output.value = static_cast<uint16_t>(input);
+ // output = numext::bit_cast<uint16_t>(input);
return _mm_packus_epi32(lo, hi);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
- return _mm_set1_epi16(from.value);
+ return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}
template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
- return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<unsigned short>(_mm_extract_epi16(from, 0)));
+ return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}
template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
@@ -1233,17 +1325,17 @@
template<> EIGEN_STRONG_INLINE Packet8bf
ploaddup<Packet8bf>(const bfloat16* from) {
- unsigned short a = from[0].value;
- unsigned short b = from[1].value;
- unsigned short c = from[2].value;
- unsigned short d = from[3].value;
+ const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+ const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+ const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+ const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
return _mm_set_epi16(d, d, c, c, b, b, a, a);
}
template<> EIGEN_STRONG_INLINE Packet8bf
ploadquad<Packet8bf>(const bfloat16* from) {
- unsigned short a = from[0].value;
- unsigned short b = from[1].value;
+ const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+ const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
@@ -1253,7 +1345,8 @@
template <>
EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
- return F32ToBf16(pabs<Packet8f>(Bf16ToF32(a)));
+ const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+ return _mm_andnot_si128(sign_mask, a);
}
template <>
@@ -1326,7 +1419,7 @@
template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
- Packet8bf sign_mask = _mm_set1_epi16(static_cast<unsigned short>(0x8000));
+ Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
return _mm_xor_si128(a, sign_mask);
}
@@ -1349,7 +1442,15 @@
template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
{
- return _mm_set_epi16(from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value, from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
+ const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+ const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+ const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+ const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+ const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+ const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+ const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+ const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+ return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 53ee53d..45f22f4 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -37,6 +37,7 @@
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
+ HasSqrt = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -47,6 +48,8 @@
template<> struct unpacket_traits<Packet8cf> {
typedef std::complex<float> type;
+ typedef Packet4cf half;
+ typedef Packet16f as_real;
enum {
size = 8,
alignment=unpacket_traits<Packet16f>::alignment,
@@ -54,7 +57,6 @@
masked_load_available=false,
masked_store_available=false
};
- typedef Packet4cf half;
};
template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
@@ -223,6 +225,7 @@
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
+ HasSqrt = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -233,6 +236,8 @@
template<> struct unpacket_traits<Packet4cd> {
typedef std::complex<double> type;
+ typedef Packet2cd half;
+ typedef Packet8d as_real;
enum {
size = 4,
alignment = unpacket_traits<Packet8d>::alignment,
@@ -240,7 +245,6 @@
masked_load_available=false,
masked_store_available=false
};
- typedef Packet2cd half;
};
template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
@@ -437,8 +441,15 @@
kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
}
-} // end namespace internal
+template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+ return psqrt_complex<Packet4cd>(a);
+}
+template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+ return psqrt_complex<Packet8cf>(a);
+}
+
+} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index bfd30c0..66f3252 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -35,7 +35,6 @@
#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
const Packet16bf p16bf_##NAME = preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
-#if defined(EIGEN_VECTORIZE_AVX512DQ)
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
plog<Packet16f>(const Packet16f& _x) {
@@ -48,8 +47,23 @@
return plog_double(_x);
}
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
-#endif
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+plog2<Packet16f>(const Packet16f& _x) {
+ return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog2<Packet8d>(const Packet8d& _x) {
+ return plog2_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
@@ -174,6 +188,7 @@
return pmax(pmul(x, e), _x);
}*/
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
// Functions for sqrt.
@@ -232,6 +247,7 @@
}
#endif
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
// prsqrt for float.
@@ -256,7 +272,7 @@
__mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
__mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
__mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
-
+
// Compute an approximate result using the rsqrt intrinsic, forcing +inf
// for denormals for consistency with AVX and SSE implementations.
Packet16f y_approx = _mm512_rsqrt14_ps(_x);
@@ -281,6 +297,7 @@
}
#endif
+F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
// prsqrt for double.
@@ -330,12 +347,12 @@
}
#endif
-#if defined(EIGEN_VECTORIZE_AVX512DQ)
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet16f plog1p<Packet16f>(const Packet16f& _x) {
return generic_plog1p(_x);
}
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
@@ -343,8 +360,8 @@
return generic_expm1(_x);
}
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
-#endif
#endif
@@ -367,6 +384,10 @@
return internal::generic_fast_tanh_float(_x);
}
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index bf7f0db..f838857 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -58,23 +58,37 @@
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
- HasHalfPacket = 0,
+ HasHalfPacket = 1,
+
+ HasCmp = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
- HasAbs = 0,
+ HasAbs = 1,
HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasConj = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 0,
- HasSqrt = 0,
- HasRsqrt = 0,
- HasExp = 0,
- HasLog = 0,
- HasBlend = 0
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExpm1 = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1,
+ HasBessel = 1,
+ HasNdtri = 1,
};
};
@@ -87,17 +101,20 @@
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 1,
+
+ HasAbs = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasBlend = 0,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
-#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
HasLog1p = 1,
HasExpm1 = 1,
HasNdtri = 1,
HasBessel = 1,
-#endif
HasExp = 1,
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
@@ -105,7 +122,11 @@
HasErf = EIGEN_FAST_MATH,
#endif
HasCmp = 1,
- HasDiv = 1
+ HasDiv = 1,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@@ -118,14 +139,16 @@
size = 8,
HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
-#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
-#endif
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
#endif
HasCmp = 1,
- HasDiv = 1
+ HasDiv = 1,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1
};
};
@@ -165,7 +188,7 @@
template<>
struct unpacket_traits<Packet16h> {
typedef Eigen::half type;
- typedef Packet16h half;
+ typedef Packet8h half;
enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
@@ -188,10 +211,27 @@
}
template <>
-EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(uint64_t from) {
+EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from) {
return _mm512_castsi512_pd(_mm512_set1_epi64(from));
}
+template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
+
+template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+ return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1));
+}
+template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+ return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1);
+}
+template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+ return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
+ 0, 0, -1, -1, 0, 0, -1, -1));
+}
+
template <>
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
return _mm512_broadcastss_ps(_mm_load_ps1(from));
@@ -281,7 +321,7 @@
template <>
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
const Packet16i& b) {
- return _mm512_mul_epi32(a, b);
+ return _mm512_mullo_epi32(a, b);
}
template <>
@@ -482,6 +522,15 @@
_mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
}
+template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
+
template <>
EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
return _mm512_set1_epi32(0xffffffffu);
@@ -598,6 +647,21 @@
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
+{
+ // Work-around for default std::round rounding mode.
+ const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
+ const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+ return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
+{
+ // Work-around for default std::round rounding mode.
+ const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+ const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+ return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
return _mm512_srai_epi32(a, N);
}
@@ -835,12 +899,34 @@
}
template<>
-EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent){
+EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
const Packet8d cst_1022d = pset1<Packet8d>(1022.0);
+#ifdef EIGEN_VECTORIZE_AVX512DQ // _mm512_cvtepi64_pd is an AVX512DQ instruction; otherwise narrow to int32 first.
+ exponent = psub(_mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), 52)), cst_1022d);
+#else
+ exponent = psub(_mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(a), 52))),
+ cst_1022d);
+#endif
const Packet8d cst_half = pset1<Packet8d>(0.5);
const Packet8d cst_inv_mant_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
- exponent = psub(_mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), 52)), cst_1022d);
- return por(pand(a, cst_inv_mant_mask), cst_half);
+ return por(pand(a, cst_inv_mant_mask), cst_half);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+ // Build e=2^n from the biased int32 exponents. Interleave them across the whole 256-bit vector as
+ // (e0,e4),(e1,e5),(e2,e6),(e3,e7); a per-128-bit-lane shuffle would misorder the middle elements.
+ Packet8i cst_1023 = pset1<Packet8i>(1023);
+ __m256i emm0 = _mm512_cvtpd_epi32(exponent);
+ emm0 = _mm256_add_epi32(emm0, cst_1023);
+ emm0 = _mm256_permutevar8x32_epi32(emm0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
+ __m256i lo = _mm256_slli_epi64(emm0, 52); // low dword of each pair -> exponent field for e0..e3
+ __m256i hi = _mm256_slli_epi64(_mm256_srli_epi64(emm0, 32), 52); // high dword of each pair -> e4..e7
+ __m512d b = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+ return pmul(a, b);
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -1270,22 +1356,6 @@
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
-template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
- return _mm512_cvttps_epi32(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
- return _mm512_cvtepi32_ps(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i,Packet16f>(const Packet16f& a) {
- return _mm512_castps_si512(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f,Packet16i>(const Packet16i& a) {
- return _mm512_castsi512_ps(a);
-}
-
// Packet math for Eigen::half
template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
return _mm256_set1_epi16(from.x);
@@ -1398,6 +1468,29 @@
return ptrue(Packet8i(a));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
+ const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+ return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
+ const Packet16h& b) {
+ return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
+ const Packet16h& b) {
+ return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+ return float2half(plset<Packet16f>(static_cast<float>(a)));
+}
+
template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
// in some cases Packet8i is a wrapper around __m256i, so we need to
// cast to Packet8i to call the correct overload.
@@ -1417,12 +1510,42 @@
return _mm256_blendv_epi8(b, a, mask);
}
+template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+ return float2half(pround<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+ return float2half(print<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+ return float2half(pceil<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+ return float2half(pfloor<Packet16f>(half2float(a)));
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
return Pack32To16(pcmp_eq(af, bf));
}
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
+ return Pack32To16(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
+ return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
+ return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
+
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
return _mm256_xor_si256(a, sign_mask);
@@ -1461,6 +1584,25 @@
return half(predux(from_float));
}
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+ Packet8h lane0 = _mm256_extractf128_si256(a, 0);
+ Packet8h lane1 = _mm256_extractf128_si256(a, 1);
+ return padd<Packet8h>(lane0, lane1);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
+ Packet16f af = half2float(a);
+ float reduced = predux_max<Packet16f>(af);
+ return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
+ Packet16f af = half2float(a);
+ float reduced = predux_min<Packet16f>(af);
+ return Eigen::half(reduced);
+}
+
template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux_mul(from_float));
@@ -1487,22 +1629,22 @@
{
EIGEN_ALIGN64 half aux[16];
pstore(aux, from);
- to[stride*0].x = aux[0].x;
- to[stride*1].x = aux[1].x;
- to[stride*2].x = aux[2].x;
- to[stride*3].x = aux[3].x;
- to[stride*4].x = aux[4].x;
- to[stride*5].x = aux[5].x;
- to[stride*6].x = aux[6].x;
- to[stride*7].x = aux[7].x;
- to[stride*8].x = aux[8].x;
- to[stride*9].x = aux[9].x;
- to[stride*10].x = aux[10].x;
- to[stride*11].x = aux[11].x;
- to[stride*12].x = aux[12].x;
- to[stride*13].x = aux[13].x;
- to[stride*14].x = aux[14].x;
- to[stride*15].x = aux[15].x;
+ to[stride*0] = aux[0];
+ to[stride*1] = aux[1];
+ to[stride*2] = aux[2];
+ to[stride*3] = aux[3];
+ to[stride*4] = aux[4];
+ to[stride*5] = aux[5];
+ to[stride*6] = aux[6];
+ to[stride*7] = aux[7];
+ to[stride*8] = aux[8];
+ to[stride*9] = aux[9];
+ to[stride*10] = aux[10];
+ to[stride*11] = aux[11];
+ to[stride*12] = aux[12];
+ to[stride*13] = aux[13];
+ to[stride*14] = aux[14];
+ to[stride*15] = aux[15];
}
EIGEN_STRONG_INLINE void
@@ -1694,7 +1836,7 @@
HasCos = EIGEN_FAST_MATH,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
- HasLog = 1,
+ HasLog = 1, // Currently fails test with bad accuracy.
HasLog1p = 1,
HasExpm1 = 1,
HasNdtri = 1,
@@ -1859,6 +2001,23 @@
return _mm256_blendv_epi8(b, a, mask);
}
+template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
+{
+ return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+ return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+ return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+ return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
+}
+
template <>
EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
const Packet16bf& b) {
@@ -1885,9 +2044,7 @@
template <>
EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) {
- Packet16bf sign_mask;
- sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
- Packet16bf result;
+ Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
return _mm256_xor_si256(a, sign_mask);
}
@@ -1898,7 +2055,8 @@
template <>
EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
- return F32ToBf16(pabs<Packet16f>(Bf16ToF32(a)));
+ const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+ return _mm256_andnot_si256(sign_mask, a);
}
template <>
@@ -1997,22 +2155,22 @@
Index stride) {
EIGEN_ALIGN64 bfloat16 aux[16];
pstore(aux, from);
- to[stride*0].value = aux[0].value;
- to[stride*1].value = aux[1].value;
- to[stride*2].value = aux[2].value;
- to[stride*3].value = aux[3].value;
- to[stride*4].value = aux[4].value;
- to[stride*5].value = aux[5].value;
- to[stride*6].value = aux[6].value;
- to[stride*7].value = aux[7].value;
- to[stride*8].value = aux[8].value;
- to[stride*9].value = aux[9].value;
- to[stride*10].value = aux[10].value;
- to[stride*11].value = aux[11].value;
- to[stride*12].value = aux[12].value;
- to[stride*13].value = aux[13].value;
- to[stride*14].value = aux[14].value;
- to[stride*15].value = aux[15].value;
+ to[stride*0] = aux[0];
+ to[stride*1] = aux[1];
+ to[stride*2] = aux[2];
+ to[stride*3] = aux[3];
+ to[stride*4] = aux[4];
+ to[stride*5] = aux[5];
+ to[stride*6] = aux[6];
+ to[stride*7] = aux[7];
+ to[stride*8] = aux[8];
+ to[stride*9] = aux[9];
+ to[stride*10] = aux[10];
+ to[stride*11] = aux[11];
+ to[stride*12] = aux[12];
+ to[stride*13] = aux[13];
+ to[stride*14] = aux[14];
+ to[stride*15] = aux[15];
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index e643b18..3304127 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -14,6 +14,22 @@
namespace internal {
+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+ return _mm512_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+ return _mm512_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
+ return _mm512_castps_si512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
+ return _mm512_castsi512_ps(a);
+}
+
template <>
struct type_casting_traits<half, float> {
enum {
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 351f451..72a489b 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -512,6 +512,9 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
return bfloat16(::log10f(float(a)));
}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
+ return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
return bfloat16(::sqrtf(float(a)));
}
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 6d92d1c..a6d2de6 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -59,21 +59,21 @@
return pmul(a, preinterpret<Packet>(plogical_shift_left<52>(ei)));
}
-// Natural logarithm
+// Natural or base 2 logarithm.
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
// TODO(gonnet): Further reduce the interval allowing for lower-degree
// polynomial interpolants -> ... -> profit!
-template <typename Packet>
+template <typename Packet, bool base2>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
-Packet plog_float(const Packet _x)
+Packet plog_impl_float(const Packet _x)
{
Packet x = _x;
const Packet cst_1 = pset1<Packet>(1.0f);
- const Packet cst_half = pset1<Packet>(0.5f);
+ const Packet cst_neg_half = pset1<Packet>(-0.5f);
// The smallest non denormalized float number.
const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
@@ -90,8 +90,6 @@
const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
- const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
- const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
// Truncate input values to the minimum positive normal.
x = pmax(x, cst_min_norm_pos);
@@ -129,14 +127,17 @@
y = pmadd(y, x3, y2);
y = pmul(y, x3);
+ y = pmadd(cst_neg_half, x2, y);
+ x = padd(x, y);
+
// Add the logarithm of the exponent back to the result of the interpolation.
- y1 = pmul(e, cst_cephes_log_q1);
- tmp = pmul(x2, cst_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, cst_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
+ if (base2) {
+ const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
+ x = pmadd(x, cst_log2e, e);
+ } else {
+ const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
+ x = pmadd(e, cst_ln2, x);
+ }
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
Packet iszero_mask = pcmp_eq(_x,pzero(_x));
@@ -149,33 +150,46 @@
por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
}
-
-/* Returns the base e (2.718...) logarithm of x.
- * The argument is separated into its exponent and fractional
- * parts. If the exponent is between -1 and +1, the logarithm
- * of the fraction is approximated by
- *
- * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
- *
- * Otherwise, setting z = 2(x-1)/x+1),
- * log(x) = z + z**3 P(z)/Q(z).
- *
- * for more detail see: http://www.netlib.org/cephes/
- */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
-Packet plog_double(const Packet _x)
+Packet plog_float(const Packet _x)
+{
+ return plog_impl_float<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_float(const Packet _x)
+{
+ return plog_impl_float<Packet, /* base2 */ true>(_x);
+}
+
+/* Returns the base e (2.718...) or base 2 logarithm of x.
+ * The argument is separated into its exponent and fractional parts.
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
+ * is approximated by
+ *
+ * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
+ *
+ * for more detail see: http://www.netlib.org/cephes/
+ */
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_impl_double(const Packet _x)
{
Packet x = _x;
const Packet cst_1 = pset1<Packet>(1.0);
- const Packet cst_half = pset1<Packet>(0.5);
- // The smallest non denormalized float number.
+ const Packet cst_neg_half = pset1<Packet>(-0.5);
+ // The smallest non denormalized double.
const Packet cst_min_norm_pos = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
+
// Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
// 1/sqrt(2) <= x < sqrt(2)
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
@@ -186,15 +200,12 @@
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
- const Packet cst_cephes_log_r0 = pset1<Packet>(1.0);
- const Packet cst_cephes_log_r1 = pset1<Packet>(1.12873587189167450590E1);
- const Packet cst_cephes_log_r2 = pset1<Packet>(4.52279145837532221105E1);
- const Packet cst_cephes_log_r3 = pset1<Packet>(8.29875266912776603211E1);
- const Packet cst_cephes_log_r4 = pset1<Packet>(7.11544750618563894466E1);
- const Packet cst_cephes_log_r5 = pset1<Packet>(2.31251620126765340583E1);
-
- const Packet cst_cephes_log_q1 = pset1<Packet>(-2.121944400546905827679e-4);
- const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375);
+ const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
+ const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
+ const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
+ const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
+ const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
+ const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
// Truncate input values to the minimum positive normal.
x = pmax(x, cst_min_norm_pos);
@@ -220,31 +231,34 @@
Packet x3 = pmul(x2, x);
// Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
- // y = x * ( z * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
- Packet y, y1, y2,y_;
+ // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 )
+ Packet y, y1, y_;
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
y = pmadd(y, x, cst_cephes_log_p2);
y1 = pmadd(y1, x, cst_cephes_log_p5);
y_ = pmadd(y, x3, y1);
- y = pmadd(cst_cephes_log_r0, x, cst_cephes_log_r1);
- y1 = pmadd(cst_cephes_log_r3, x, cst_cephes_log_r4);
- y = pmadd(y, x, cst_cephes_log_r2);
- y1 = pmadd(y1, x, cst_cephes_log_r5);
+ y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+ y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
+ y = pmadd(y, x, cst_cephes_log_q2);
+ y1 = pmadd(y1, x, cst_cephes_log_q5);
y = pmadd(y, x3, y1);
y_ = pmul(y_, x3);
y = pdiv(y_, y);
+ y = pmadd(cst_neg_half, x2, y);
+ x = padd(x, y);
+
// Add the logarithm of the exponent back to the result of the interpolation.
- y1 = pmul(e, cst_cephes_log_q1);
- tmp = pmul(x2, cst_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, cst_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
+ if (base2) {
+ const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
+ x = pmadd(x, cst_log2e, e);
+ } else {
+ const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
+ x = pmadd(e, cst_ln2, x);
+ }
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
Packet iszero_mask = pcmp_eq(_x,pzero(_x));
@@ -257,6 +271,22 @@
por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
}
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_double(const Packet _x)
+{
+ return plog_impl_double<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_double(const Packet _x)
+{
+ return plog_impl_double<Packet, /* base2 */ true>(_x);
+}
+
/** \internal \returns log(1 + x) computed using W. Kahan's formula.
See: http://www.plunk.org/~hatch/rightway.php
*/
@@ -643,6 +673,120 @@
return psincos_float<false>(x);
}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psqrt_complex(const Packet& a) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ typedef typename Scalar::value_type RealScalar;
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+ // Computes the principal sqrt of the complex numbers in the input.
+ //
+ // For example, for packets containing 2 complex numbers stored in interleaved format
+ // a = [a0, a1] = [x0, y0, x1, y1],
+ // where x0 = real(a0), y0 = imag(a0) etc., this function returns
+ // b = [b0, b1] = [u0, v0, u1, v1],
+ // such that b0^2 = a0, b1^2 = a1.
+ //
+ // To derive the formula for the complex square roots, let's consider the equation for
+ // a single complex square root of the number x + i*y. We want to find real numbers
+ // u and v such that
+ // (u + i*v)^2 = x + i*y <=>
+ // u^2 - v^2 + i*2*u*v = x + i*y.
+ // By equating the real and imaginary parts we get:
+ // u^2 - v^2 = x
+ // 2*u*v = y.
+ //
+ // For x >= 0, this has the numerically stable solution
+ // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+ // v = 0.5 * (y / u)
+ // and for x < 0,
+ // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+ // u = |0.5 * (y / v)|
+ //
+ // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
+ // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2).
+
+ // In the following, without loss of generality, we have annotated the code, assuming
+ // that the input is a packet of 2 complex numbers.
+ //
+ // Step 1. Compute l = [l0, l0, l1, l1], where
+ // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2)
+ // To avoid over- and underflow, we use the stable formula for each hypotenuse
+ // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
+ // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
+
+ Packet a_flip = pcplxflip(a);
+ RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
+ RealPacket a_abs_flip = pabs(a_flip.v); // [|y0|, |x0|, |y1|, |x1|]
+ RealPacket a_max = pmax(a_abs, a_abs_flip);
+ RealPacket a_min = pmin(a_abs, a_abs_flip);
+ RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
+ RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
+ RealPacket r = pdiv(a_min, a_max);
+ const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
+ RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
+ // Set l to a_max if a_min is zero.
+ l = pselect(a_min_zero_mask, a_max, l);
+
+ // Step 2. Compute [rho0, *, rho1, *], where
+ // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|))
+ // We don't care about the imaginary parts computed here. They will be overwritten later.
+ const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
+ Packet rho;
+ rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
+
+ // Step 3. Compute [rho0, eta0, rho1, eta1], where
+ // eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2.
+ // Set eta = 0 if input is 0 + i0.
+ RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
+ RealPacket real_mask = peven_mask(a.v);
+ Packet positive_real_result;
+ // Compute result for inputs with positive real part.
+ positive_real_result.v = pselect(real_mask, rho.v, eta);
+
+ // Step 4. Compute solution for inputs with negative real part:
+ // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
+ const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
+ RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
+ Packet negative_real_result;
+ // Notice that rho is positive, so taking its absolute value is a noop.
+ negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
+
+ // Step 5. Select solution branch based on the sign of the real parts.
+ Packet negative_real_mask;
+ negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
+ negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
+ Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
+
+ // Step 6. Handle special cases for infinities:
+ // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
+ // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
+ // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
+ // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
+ const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+ Packet is_inf;
+ is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
+ Packet is_real_inf;
+ is_real_inf.v = pand(is_inf.v, real_mask);
+ is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
+ // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
+ Packet real_inf_result;
+ real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
+ real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
+ // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
+ Packet is_imag_inf;
+ is_imag_inf.v = pandnot(is_inf.v, real_mask);
+ is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
+ Packet imag_inf_result;
+ imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
+
+ return pselect(is_imag_inf, imag_inf_result,
+ pselect(is_real_inf, real_inf_result,result));
+}
+
/* polevl (modified for Eigen)
*
* Evaluate polynomial
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 0e02a1b..491f1c9 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -32,12 +32,24 @@
EIGEN_UNUSED
Packet plog_float(const Packet _x);
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_float(const Packet _x);
+
/** \internal \returns log(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet plog_double(const Packet _x);
+/** \internal \returns log2(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_double(const Packet _x);
+
/** \internal \returns log(1 + x) */
template<typename Packet>
Packet generic_plog1p(const Packet& x);
@@ -70,8 +82,15 @@
EIGEN_UNUSED
Packet pcos_float(const Packet& x);
+/** \internal \returns sqrt(x) for complex types */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psqrt_complex(const Packet& a);
+
template <typename Packet, int N> struct ppolevl;
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 4dde913..b273abe 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -36,12 +36,6 @@
#ifndef EIGEN_HALF_H
#define EIGEN_HALF_H
-#if EIGEN_HAS_CXX11
-#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
-#else
-#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
-#endif
-
#include <sstream>
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
@@ -56,6 +50,13 @@
#define EIGEN_CONSTEXPR
#endif
+#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \
+ template <> \
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED \
+ PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+ return float2half(METHOD<PACKET_F>(half2float(_x))); \
+ }
+
namespace Eigen {
struct half;
@@ -80,14 +81,12 @@
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
- #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
- typedef __half __half_raw;
- #endif // defined(EIGEN_HAS_CUDA_FP16)
-
+ #if EIGEN_CUDA_SDK_VER < 90000
+ // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+ typedef __half __half_raw;
+ #endif // EIGEN_CUDA_SDK_VER < 90000
#elif defined(SYCL_DEVICE_ONLY)
-typedef cl::sycl::half __half_raw;
-
+ typedef cl::sycl::half __half_raw;
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
@@ -102,7 +101,7 @@
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
#elif defined(EIGEN_HAS_CUDA_FP16)
- #if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000)
+ #if EIGEN_CUDA_SDK_VER >= 90000
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
@@ -200,7 +199,7 @@
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
- static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
@@ -448,7 +447,7 @@
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
// HIP/CUDA/Default have a member 'x' of type uint16_t.
// For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
// For SYCL, cl::sycl::half is _Float16, so cast directly.
@@ -617,6 +616,10 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
+ return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -744,7 +747,7 @@
return half_impl::raw_uint16_to_half(0x7c00);
}
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
- return half_impl::raw_uint16_to_half(0x7c01);
+ return half_impl::raw_uint16_to_half(0x7e00);
}
};
@@ -767,27 +770,6 @@
} // end namespace std
-// Add the missing shfl_xor intrinsic
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
- defined(EIGEN_HIPCC)
-
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
- #if (EIGEN_CUDA_SDK_VER < 90000) || \
- defined(EIGEN_HAS_HIP_FP16)
- return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
- #else
- return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
- #endif
-}
-#endif
-
-// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || defined(EIGEN_HIPCC)
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
- return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
-}
-#endif
-
namespace Eigen {
namespace numext {
@@ -823,4 +805,69 @@
} // namespace numext
} // namespace Eigen
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
+// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+// with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__-only functions.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
+ || defined(EIGEN_HIPCC)
+
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) {
+ return static_cast<Eigen::half>(__shfl_sync(mask, static_cast<__half>(var), srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+ return static_cast<Eigen::half>(__shfl_up_sync(mask, static_cast<__half>(var), delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+ return static_cast<Eigen::half>(__shfl_down_sync(mask, static_cast<__half>(var), delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) {
+ return static_cast<Eigen::half>(__shfl_xor_sync(mask, static_cast<__half>(var), laneMask, width));
+}
+
+#else // HIP or CUDA SDK < 9.0
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+}
+
+#endif // HIP vs CUDA
+#endif // __shfl*
+
+// __ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
+ || defined(EIGEN_HIPCC)
+EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
+ return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
+}
+#endif // __ldg
+
#endif // EIGEN_HALF_H
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index dd4e77d..c16f95e 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -14,10 +14,21 @@
namespace internal {
+// Read-only data cached load available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
+#define EIGEN_GPU_HAS_LDG 1
+#endif
+
+// FP16 math available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
+#endif
+
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@@ -237,7 +248,7 @@
pcmp_lt<double2>(const double2& a, const double2& b) {
return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
}
-#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE)
+#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
@@ -342,7 +353,7 @@
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const float4*)from);
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -350,7 +361,7 @@
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const double2*)from);
#else
return make_double2(from[0], from[1]);
@@ -359,7 +370,7 @@
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -367,7 +378,7 @@
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return make_double2(__ldg(from+0), __ldg(from+1));
#else
return make_double2(from[0], from[1]);
@@ -480,9 +491,7 @@
// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
// its corresponding packet_traits<Eigen::half> must be visible on host.
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \
- (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
typedef ulonglong2 Packet4h2;
template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
@@ -513,17 +522,41 @@
};
};
+namespace {
+// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __halves2half2(a, b);
+#else
+ // Round-about way since __halves2half2 is a __device__ function.
+ return __floats2half2_rn(__half2float(a), __half2float(b));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __low2half(a);
+#else
+ return __float2half(__low2float(a));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __high2half(a);
+#else
+ return __float2half(__high2float(a));
+#endif
+}
+} // namespace
+
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
- half2 r;
- r.x = from;
- r.y = from;
- return r;
-#elif defined(EIGEN_HIPCC)
- return __half2{from,from};
-#else
+#if defined(EIGEN_GPU_COMPILE_PHASE)
return __half2half2(from);
+#else
+ const float f = __half2float(from);
+ return __floats2half2_rn(f, f);
#endif
}
@@ -539,7 +572,8 @@
return r;
}
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+// We now need this visible on both host and device.
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
namespace {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
@@ -547,11 +581,11 @@
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
- return __halves2half2(from[0], from[1]);
+ return combine_half(from[0], from[1]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
- return __halves2half2(from[0], from[0]);
+ return combine_half(from[0], from[0]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
@@ -561,201 +595,164 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
const half2& from) {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
- to[0] = from.x;
- to[1] = from.y;
-#else
- to[0] = __low2half(from);
- to[1] = __high2half(from);
-#endif
+ to[0] = get_half2_low(from);
+ to[1] = get_half2_high(from);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
const Eigen::half* from) {
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const half2*)from);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 350
- return __ldg((const half2*)from);
#else
- return __halves2half2(*(from+0), *(from+1));
-#endif
-
+ return combine_half(*(from+0), *(from+1));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
const Eigen::half* from) {
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_GPU_HAS_LDG)
return __halves2half2(__ldg(from+0), __ldg(from+1));
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 350
- return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
- return __halves2half2(*(from+0), *(from+1));
-#endif
-
+ return combine_half(*(from+0), *(from+1));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
Index stride) {
- return __halves2half2(from[0*stride], from[1*stride]);
+ return combine_half(from[0*stride], from[1*stride]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
Eigen::half* to, const half2& from, Index stride) {
- to[stride*0] = __low2half(from);
- to[stride*1] = __high2half(from);
+ to[stride*0] = get_half2_low(from);
+ to[stride*1] = get_half2_high(from);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
- return __low2half(a);
+ return get_half2_low(a);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
return pset1<half2>(true_half);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
half false_half = half_impl::raw_uint16_to_half(0x0000u);
return pset1<half2>(false_half);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
- __half a1 = __low2half(kernel.packet[0]);
- __half a2 = __high2half(kernel.packet[0]);
- __half b1 = __low2half(kernel.packet[1]);
- __half b2 = __high2half(kernel.packet[1]);
- kernel.packet[0] = __halves2half2(a1, b1);
- kernel.packet[1] = __halves2half2(a2, b2);
+ __half a1 = get_half2_low(kernel.packet[0]);
+ __half a2 = get_half2_high(kernel.packet[0]);
+ __half b1 = get_half2_low(kernel.packet[1]);
+ __half b2 = get_half2_high(kernel.packet[1]);
+ kernel.packet[0] = combine_half(a1, b1);
+ kernel.packet[1] = combine_half(a2, b2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
- return __halves2half2(a, __float2half(f));
-#endif
-
+ return combine_half(a, __float2half(f));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
const half2& a,
const half2& b) {
- half mask_low = __low2half(mask);
- half mask_high = __high2half(mask);
- half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
- half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
- return __halves2half2(result_low, result_high);
+ half mask_low = get_half2_low(mask);
+ half mask_high = get_half2_high(mask);
+ half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
+ half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
+ return combine_half(result_low, result_high);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
- return __halves2half2(eq1, eq2);
+ return combine_half(eq1, eq2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
- return __halves2half2(eq1, eq2);
+ return combine_half(eq1, eq2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hadd2(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -766,19 +763,11 @@
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hsub2(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@@ -789,39 +778,23 @@
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hneg2(a);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hneg2(a);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hmul2(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -832,20 +805,12 @@
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
const half2& b,
const half2& c) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hfma2(a, b, c);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@@ -858,18 +823,13 @@
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
+#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -877,7 +837,6 @@
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
-
#endif
}
@@ -887,9 +846,9 @@
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
@@ -898,89 +857,53 @@
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hadd(__low2half(a), __high2half(a));
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 + a2));
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- __half first = __low2half(a);
- __half second = __high2half(a);
- return __hgt(first, second) ? first : second;
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
- return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
-
+ return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- __half first = __low2half(a);
- __half second = __high2half(a);
- return __hlt(first, second) ? first : second;
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
- return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
-
+ return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hmul(__low2half(a), __high2half(a));
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 * a2));
#endif
-
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -1108,14 +1031,7 @@
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- Packet4h2 r;
- r = __ldg((const Packet4h2*)from);
- return r;
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
Packet4h2 r;
r = __ldg((const Packet4h2*)from);
return r;
@@ -1128,8 +1044,6 @@
r_alias[3] = ploadt_ro_aligned(from + 6);
return r;
#endif
-
-#endif
}
template <>
@@ -1149,10 +1063,10 @@
pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
- p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
- p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
- p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
- p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
+ p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
+ p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
+ p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
+ p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
return r;
}
@@ -1187,13 +1101,13 @@
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
- const Packet4h2& a) {
+ const Packet4h2& /*a*/) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
return pset1<Packet4h2>(true_half);
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
half false_half = half_impl::raw_uint16_to_half(0x0000u);
return pset1<Packet4h2>(false_half);
}
@@ -1233,12 +1147,12 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose_half(half2& f0, half2& f1) {
- __half a1 = __low2half(f0);
- __half a2 = __high2half(f0);
- __half b1 = __low2half(f1);
- __half b2 = __high2half(f1);
- f0 = __halves2half2(a1, b1);
- f1 = __halves2half2(a2, b2);
+ __half a1 = get_half2_low(f0);
+ __half a2 = get_half2_high(f0);
+ __half b1 = get_half2_low(f1);
+ __half b2 = get_half2_high(f1);
+ f0 = combine_half(a1, b1);
+ f1 = combine_half(a2, b2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
@@ -1294,7 +1208,7 @@
ptranspose_half(f_row0[1], f_row1[1]);
ptranspose_half(f_row2[0], f_row3[0]);
ptranspose_half(f_row2[1], f_row3[1]);
-
+
}
template <>
@@ -1312,9 +1226,7 @@
p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
__hadd(a, __float2half(7.0f)));
return r;
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#elif EIGEN_CUDA_ARCH >= 530
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
@@ -1337,14 +1249,12 @@
float f = __half2float(a);
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
- p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
- p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
- p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
- p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
+ p_alias[0] = combine_half(a, __float2half(f + 1.0f));
+ p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
+ p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
+ p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
return r;
#endif
-
-#endif
}
template <>
@@ -1562,9 +1472,9 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- half2 m0 = __halves2half2(predux_max(a_alias[0]),
+ half2 m0 = combine_half(predux_max(a_alias[0]),
predux_max(a_alias[1]));
- half2 m1 = __halves2half2(predux_max(a_alias[2]),
+ half2 m1 = combine_half(predux_max(a_alias[2]),
predux_max(a_alias[3]));
__half first = predux_max(m0);
__half second = predux_max(m1);
@@ -1581,9 +1491,9 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- half2 m0 = __halves2half2(predux_min(a_alias[0]),
+ half2 m0 = combine_half(predux_min(a_alias[0]),
predux_min(a_alias[1]));
- half2 m1 = __halves2half2(predux_min(a_alias[2]),
+ half2 m1 = combine_half(predux_min(a_alias[2]),
predux_min(a_alias[3]));
__half first = predux_min(m0);
__half second = predux_min(m1);
@@ -1685,13 +1595,7 @@
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hadd2(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -1702,20 +1606,12 @@
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
-
-#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
- return __hmul2(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -1726,19 +1622,14 @@
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
-
-#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
+#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -1746,7 +1637,6 @@
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
-
#endif
}
@@ -1757,9 +1647,9 @@
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
template<>
@@ -1769,14 +1659,17 @@
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
-#endif // defined(EIGEN_CUDA_ARCH)
+// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
-#endif // defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)
+#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+
+#undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
} // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 448600b..a889ab1 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -76,6 +76,7 @@
{
typedef std::complex<float> type;
typedef Packet1cf half;
+ typedef Packet2f as_real;
enum
{
size = 1,
@@ -89,6 +90,7 @@
{
typedef std::complex<float> type;
typedef Packet1cf half;
+ typedef Packet4f as_real;
enum
{
size = 2,
@@ -430,6 +432,14 @@
kernel.packet[1].v = tmp;
}
+template<> EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+ return psqrt_complex<Packet1cf>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+ return psqrt_complex<Packet2cf>(a);
+}
+
//---------- double ----------
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
@@ -475,6 +485,8 @@
template<> struct unpacket_traits<Packet1cd>
{
typedef std::complex<double> type;
+ typedef Packet1cd half;
+ typedef Packet2d as_real;
enum
{
size=1,
@@ -483,7 +495,6 @@
masked_load_available=false,
masked_store_available=false
};
- typedef Packet1cd half;
};
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
@@ -640,6 +651,11 @@
kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+ return psqrt_complex<Packet1cd>(a);
+}
+
#endif // EIGEN_ARCH_ARM64
} // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 709cebe..90ffee7 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -192,7 +192,9 @@
HasExp = 1,
HasSqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH
+ HasErf = EIGEN_FAST_MATH,
+ HasBessel = 0, // Issues with accuracy.
+ HasNdtri = 0
};
};
@@ -1157,6 +1159,17 @@
template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
+
template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
{
return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
@@ -1194,6 +1207,17 @@
template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
+
template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
{
return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
@@ -3321,7 +3345,9 @@
HasExp = 1,
HasSqrt = 0,
HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH
+ HasErf = EIGEN_FAST_MATH,
+ HasBessel = 0, // Issues with accuracy.
+ HasNdtri = 0
};
};
@@ -3371,6 +3397,10 @@
return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
}
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {  // converts a per-lane float comparison mask into a bf16-lane mask
+ return vmovn_u32(vreinterpretq_u32_f32(p));  // bit-level narrowing of each 32-bit lane, no float value conversion; NOTE(review): assumes every lane is all-ones or all-zeros
+}
+
template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
return pset1<Packet4us>(from.value);
}
@@ -3408,12 +3438,34 @@
return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
}
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+ const Packet4bf &b)
+{
+ return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
+ const Packet4bf &b)
+{
+ return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
const Packet4bf &b)
{
return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+ const Packet4bf &b)
+{
+ return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
+ const Packet4bf &b)
+{
+ return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
const Packet4bf &b)
{
@@ -3528,17 +3580,17 @@
template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
{
- return F32ToBf16(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+ return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
{
- return F32ToBf16(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+ return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
{
- return F32ToBf16(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+ return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
@@ -3684,8 +3736,20 @@
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
+
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
+
// WARNING: this pfloor implementation makes sense for inputs that fit in
// signed int64 integers (up to ~9.22e18), hence this is currently only used
// by pexp and not exposed through HasFloor.
@@ -3883,7 +3947,10 @@
HasCos = 0,
HasLog = 0,
HasExp = 0,
- HasSqrt = 1
+ HasSqrt = 1,
+ HasErf = EIGEN_FAST_MATH,
+ HasBessel = 0, // Issues with accuracy.
+ HasNdtri = 0,
};
};
@@ -4022,6 +4089,16 @@
return vmin_f16(a, b);
}
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
+
template <>
EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
return vmaxq_f16(a, b);
@@ -4032,6 +4109,16 @@
return vmax_f16(a, b);
}
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
+
#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
template <> \
EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 0d322a2..58cdb5d 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -40,6 +40,7 @@
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
+ HasSqrt = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -50,7 +51,18 @@
};
#endif
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> {
+ typedef std::complex<float> type;
+ typedef Packet2cf half;
+ typedef Packet4f as_real;
+ enum {
+ size=2,
+ alignment=Aligned16,
+ vectorizable=true,
+ masked_load_available=false,
+ masked_store_available=false
+ };
+};
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -83,7 +95,6 @@
}
template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
-
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
@@ -255,6 +266,7 @@
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
+ HasSqrt = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -264,7 +276,18 @@
};
#endif
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> {
+ typedef std::complex<double> type;
+ typedef Packet1cd half;
+ typedef Packet2d as_real;
+ enum {
+ size=1,
+ alignment=Aligned16,
+ vectorizable=true,
+ masked_load_available=false,
+ masked_store_available=false
+ };
+};
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -426,8 +449,15 @@
return Packet2cf(_mm_castpd_ps(result));
}
-} // end namespace internal
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+ return psqrt_complex<Packet1cd>(a);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+ return psqrt_complex<Packet2cf>(a);
+}
+
+} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 71ec6f8..2d7df2f 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -30,6 +30,16 @@
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog2<Packet4f>(const Packet4f& _x) {
+ return plog2_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d plog2<Packet2d>(const Packet2d& _x) {
+ return plog2_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog1p<Packet4f>(const Packet4f& _x) {
return generic_plog1p(_x);
}
@@ -99,6 +109,9 @@
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }  // returns the input unchanged; NOTE(review): presumably because sqrt(0)==0 and sqrt(1)==1 for bool lanes
+
#if EIGEN_FAST_MATH
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4db7014..4e733c7 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -218,7 +218,8 @@
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
- HasConj = 0
+ HasConj = 0,
+ HasSqrt = 1
};
};
@@ -266,6 +267,10 @@
template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }  // _mm_set_epi32 lists lanes 3..0, so lanes 0 and 2 are all-ones
+template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }  // same pattern: 32-bit lanes 0 and 2 are all-ones
+template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }  // low 64 bits (double lane 0) all-ones, high 64 bits zero
+
template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index d3e41b4..ddf5a97 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -140,6 +140,11 @@
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+ Packet2d eq = vec_cmpeq (a.v, b.v);  // lane-wise equality mask over the two lanes of a.v / b.v
+ Packet2d tmp = { eq[1], eq[0] };  // the same mask with its two lanes swapped
+ return (Packet1cd)pand<Packet2d>(eq, tmp);  // a lane stays set only if BOTH lanes compared equal (presumably real and imaginary parts)
+}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
@@ -281,6 +286,17 @@
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+ Packet4f eq = pcmp_eq<Packet4f> (a.v, b.v);  // lane-wise equality mask, stored as two halves eq.v4f[0] / eq.v4f[1]
+ Packet2cf res;
+ Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] };  // first half of the mask with its two lanes swapped
+ Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] };  // second half of the mask with its two lanes swapped
+ res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);  // a lane stays set only if both lanes of its pair compared equal
+ res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);  // same combination for the second half
+ return res;
+}
+
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
{
Packet2cf res;
@@ -387,6 +403,11 @@
return result;
}
#else
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+ Packet4f eq = vec_cmpeq (a.v, b.v);  // lane-wise equality mask over the four lanes
+ Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };  // swap lanes within each adjacent pair (0<->1, 2<->3)
+ return (Packet2cf)pand<Packet4f>(eq, tmp);  // a lane stays set only if both lanes of its pair compared equal
+}
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 689ecc7..1635e12 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -140,7 +140,6 @@
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-/*
Packet4f x = _x;
Packet4f tmp, fx;
@@ -171,16 +170,11 @@
y = padd(y, p4f_1);
// build 2^n
- emm0 = vec_cts(fx, 0);
+ emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] };
emm0 = emm0 + p4i_0x7f;
emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
- isnumber_mask);*/
- return _x;
+ return pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x);
#else
Packet4f res;
res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 3fb642a..eb378a1 100755
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -193,11 +193,7 @@
HasSin = 0,
HasCos = 0,
HasLog = 0,
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
- HasExp = 0,
-#else
HasExp = 1,
-#endif
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = 1,
@@ -741,16 +737,16 @@
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+ res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = por(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+ res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
return res;
}
@@ -890,6 +886,31 @@
result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
return result;
}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
#else
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index c7ed18c..eee6ae1 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -396,6 +396,22 @@
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };
/** \internal
+ *
+ * \brief Template functor to compute the base-2 logarithm of a scalar
+ *
+ * \sa class CwiseUnaryOp, Cwise::log2()
+ */
+template<typename Scalar> struct scalar_log2_op {  // operator() is the scalar path, packetOp the vectorized path
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); }  // log2(a) = log2(e) * ln(a); numext::log rather than std::log so non-builtin Scalar types resolve their own log
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log2_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+
+/** \internal
* \brief Template functor to compute the square root of a scalar
* \sa class CwiseUnaryOp, Cwise::sqrt()
*/
@@ -422,6 +438,18 @@
};
};
+// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings.
+template<> struct scalar_sqrt_op<bool> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+ template <typename Packet>
+ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; }
+};
+template <>
+struct functor_traits<scalar_sqrt_op<bool> > {
+ enum { Cost = 1, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
/** \internal
* \brief Template functor to compute the reciprocal square root of a scalar
* \sa class CwiseUnaryOp, Cwise::rsqrt()
@@ -719,6 +747,19 @@
struct functor_traits<scalar_square_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_square_op<bool> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+ template<typename Packet>
+ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+ { return a; }
+};
+template<>
+struct functor_traits<scalar_square_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
/** \internal
* \brief Template functor to compute the cube of a scalar
* \sa class CwiseUnaryOp, Cwise::cube()
@@ -735,6 +776,19 @@
struct functor_traits<scalar_cube_op<Scalar> >
{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_cube_op<bool> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+ template<typename Packet>
+ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+ { return a; }
+};
+template<>
+struct functor_traits<scalar_cube_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
/** \internal
* \brief Template functor to compute the rounded value of a scalar
* \sa class CwiseUnaryOp, ArrayBase::round()
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
old mode 100755
new mode 100644
index df650fd..d352f1f
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -14,7 +14,7 @@
/** \geometry_module \ingroup Geometry_Module
*
- * \class Scaling
+ * \class UniformScaling
*
* \brief Represents a generic uniform scaling transformation
*
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 4e639d1..6e6cab3 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -1086,7 +1086,7 @@
* \code
typedef Triplet<double> T;
std::vector<T> tripletList;
- triplets.reserve(estimation_of_entries);
+ tripletList.reserve(estimation_of_entries);
for(...)
{
// ...
diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h
index 006ac11..6d47e75 100644
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h
@@ -98,19 +98,7 @@
{ return deque_base::insert(position,x); }
void insert(const_iterator position, size_type new_size, const value_type& x)
{ deque_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1)
- // workaround GCC std::deque implementation
- // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to
- // fall-back to the default case
- void resize(size_type new_size, const value_type& x)
- {
- if (new_size < deque_base::size())
- deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
- else
- deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
- }
#else
- // either non-GCC or GCC between 4.1 and 10.1
// default implementation which should always work.
void resize(size_type new_size, const value_type& x)
{
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 59a4ee6..b7ea22a 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -14,6 +14,7 @@
typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
+typedef CwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived> Log2ReturnType;
typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
typedef CwiseUnaryOp<internal::scalar_tan_op<Scalar>, const Derived> TanReturnType;
@@ -159,6 +160,18 @@
return Log10ReturnType(derived());
}
+/** \returns an expression of the coefficient-wise base-2 logarithm of *this.
+ *
+ * This function computes the coefficient-wise base-2 logarithm.
+ *
+ */
+EIGEN_DEVICE_FUNC
+inline const Log2ReturnType
+log2() const
+{
+ return Log2ReturnType(derived());
+}
+
/** \returns an expression of the coefficient-wise square root of *this.
*
* This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the
diff --git a/cmake/FindMetis.cmake b/cmake/FindMetis.cmake
index 5c9cbcb..747f882 100644
--- a/cmake/FindMetis.cmake
+++ b/cmake/FindMetis.cmake
@@ -258,7 +258,8 @@
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(METIS DEFAULT_MSG
METIS_LIBRARIES
- METIS_WORKS)
+ METIS_WORKS
+ METIS_INCLUDE_DIRS)
#
# TODO: Add possibility to check for specific functions in the library
#
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index 22cf346..da89857 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-
- <!-- Fixed x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- 2 x 2 Matrix -->
- <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
- <DisplayString>[2, 2] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 3 x 3 Matrix -->
- <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
- <DisplayString>[3, 3] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 4 x 4 Matrix -->
- <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
- <DisplayString>[4, 4] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- Dynamic x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Column Vector -->
- <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_cols</Item>
- <ArrayItems>
- <Size>m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Row Vector -->
- <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_rows</Item>
- <ArrayItems>
- <Size>m_storage.m_rows</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed Vector -->
- <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
- <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
- <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
- <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
- <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- <Item Name="[w]">m_storage.m_data.array[3]</Item>
- </Expand>
- </Type>
-
-</AutoVisualizer>
+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+ <!-- Fixed x Fixed Matrix (also fixed-size Array); $T2 = rows, $T3 = cols -->
+ <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,*,*,*,*,*>"/>
+ <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- 2 x 2 Matrix -->
+ <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
+ <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 3 x 3 Matrix -->
+ <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
+ <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 4 x 4 Matrix -->
+ <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
+ <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Column Vector -->
+ <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_cols</Item>
+ <ArrayItems>
+ <Size>m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Row Vector -->
+ <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_rows</Item>
+ <ArrayItems>
+ <Size>m_storage.m_rows</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed Vector -->
+ <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
+ <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
+ <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
+ <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
+ <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ <Item Name="[w]">m_storage.m_data.array[3]</Item>
+ </Expand>
+ </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 273c10d..35ef580 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2],
- w : ($c.m_storage.m_data.array)[3]
- )
- )
-
- preview
- (
- #(
- "[",
- 4,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 4),
- ")"
- )
- )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2]
- )
- )
-
- preview
- (
- #(
- "[",
- 3,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 3),
- ")"
- )
- )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1]
- )
- )
-
- preview
- (
- #(
- "[",
- 2,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 2),
- ")"
- )
- )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0]
- )
- )
-
- preview
- (
- #(
- "[",
- 1,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 1),
- ")"
- )
- )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.RowsAtCompileTime*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data.array)[$i],g],
- size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly regardless of the storage
+; * order (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2],
+ w : ($c.m_storage.m_data.array)[3]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 4,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+ ")"
+ )
+ )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 3,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+ ")"
+ )
+ )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 2,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+ ")"
+ )
+ )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 1,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+ ")"
+ )
+ )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.RowsAtCompileTime*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data.array)[$i],g],
+ size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index ddfd7e8..bc1e03c 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -737,6 +737,14 @@
"${Eigen_SOURCE_DIR}/unsupported/doc/examples" \
"${Eigen_SOURCE_DIR}/unsupported/doc/snippets"
+# Forward declarations of class templates cause the title of the main page for
+# the class template to not contain the template signature. This only happens
+# when the \class command is used to document the class. Possibly caused
+# by https://github.com/doxygen/doxygen/issues/7698. Confirmed fixed by
+# doxygen release 1.8.19.
+
+EXCLUDE += "${Eigen_SOURCE_DIR}/Eigen/src/Core/util/ForwardDeclarations.h"
+
# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix file system feature) are excluded
# from the input.
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index 81a73ee..9779f3f 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -244,7 +244,7 @@
<td>
\code
sm1.valuePtr(); // Pointer to the values
-sm1.innerIndextr(); // Pointer to the indices.
+sm1.innerIndexPtr(); // Pointer to the indices.
sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector
\endcode
</td>
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 59440cf..8eda8d2 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -450,7 +450,7 @@
endif()
endif()
-option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF)
+cmake_dependent_option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF "EIGEN_BUILD_DOC" OFF)
if(EIGEN_TEST_BUILD_DOCUMENTATION)
add_dependencies(buildtests doc)
endif()
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 48c0935..6910f0e 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -319,6 +319,7 @@
VERIFY_IS_APPROX(m3.log(), log(m3));
VERIFY_IS_APPROX(m3.log1p(), log1p(m3));
VERIFY_IS_APPROX(m3.log10(), log10(m3));
+ VERIFY_IS_APPROX(m3.log2(), log2(m3));
VERIFY((!(m1>m2) == (m1<=m2)).all());
@@ -372,6 +373,7 @@
VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
VERIFY_IS_APPROX(log10(m3), log(m3)/log(10));
+ VERIFY_IS_APPROX(log2(m3), log(m3)/log(2));
// scalar by array division
const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
@@ -423,6 +425,7 @@
VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
VERIFY_IS_APPROX(m1.log(), log(m1));
VERIFY_IS_APPROX(m1.log10(), log10(m1));
+ VERIFY_IS_APPROX(m1.log2(), log2(m1));
VERIFY_IS_APPROX(m1.abs(), abs(m1));
VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
VERIFY_IS_APPROX(m1.sqrt(), sqrt(m1));
@@ -478,6 +481,7 @@
VERIFY_IS_APPROX(abs(m1), sqrt(square(m1.real())+square(m1.imag())));
VERIFY_IS_APPROX(abs(m1), sqrt(abs2(m1)));
VERIFY_IS_APPROX(log10(m1), log(m1)/log(10));
+ VERIFY_IS_APPROX(log2(m1), log(m1)/log(2));
VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1);
diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp
index fc648df..1df22f7 100644
--- a/test/bfloat16_float.cpp
+++ b/test/bfloat16_float.cpp
@@ -274,9 +274,23 @@
VERIFY_IS_EQUAL(
numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::infinity()),
numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::infinity())) );
- VERIFY_IS_EQUAL(
- numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::quiet_NaN()),
- numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::quiet_NaN())) );
+ // There is no guarantee that casting a 32-bit NaN to bfloat16 has a precise
+ // bit pattern. We test that it is in fact a NaN, then test the signaling
+ // bit (msb of significand is 1 for quiet, 0 for signaling).
+ const numext::uint16_t BFLOAT16_QUIET_BIT = 0x0040;
+ VERIFY(
+ (numext::isnan)(std::numeric_limits<bfloat16>::quiet_NaN())
+ && (numext::isnan)(bfloat16(std::numeric_limits<float>::quiet_NaN()))
+ && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::quiet_NaN()) & BFLOAT16_QUIET_BIT) > 0)
+ && ((numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::quiet_NaN())) & BFLOAT16_QUIET_BIT) > 0) );
+ // After a cast to bfloat16, a signaling NaN may become non-signaling. Thus,
+ // we check that both are NaN, and that only the `numeric_limits` version is
+ // signaling.
+ VERIFY(
+ (numext::isnan)(std::numeric_limits<bfloat16>::signaling_NaN())
+ && (numext::isnan)(bfloat16(std::numeric_limits<float>::signaling_NaN()))
+ && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::signaling_NaN()) & BFLOAT16_QUIET_BIT) == 0) );
+
VERIFY( (std::numeric_limits<bfloat16>::min)() > bfloat16(0.f) );
VERIFY( (std::numeric_limits<bfloat16>::denorm_min)() > bfloat16(0.f) );
VERIFY_IS_EQUAL( (std::numeric_limits<bfloat16>::denorm_min)()/bfloat16(2), bfloat16(0.f) );
diff --git a/test/half_float.cpp b/test/half_float.cpp
index cf6df54..b2c2219 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -136,12 +136,22 @@
VERIFY_IS_EQUAL(
numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::infinity()),
numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::infinity())) );
- VERIFY_IS_EQUAL(
- numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::quiet_NaN()),
- numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::quiet_NaN())) );
- VERIFY_IS_EQUAL(
- numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()),
- numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::signaling_NaN())) );
+ // There is no guarantee that casting a 32-bit NaN to 16-bit has a precise
+ // bit pattern. We test that it is in fact a NaN, then test the signaling
+ // bit (msb of significand is 1 for quiet, 0 for signaling).
+ const numext::uint16_t HALF_QUIET_BIT = 0x0200;
+ VERIFY(
+ (numext::isnan)(std::numeric_limits<half>::quiet_NaN())
+ && (numext::isnan)(half(std::numeric_limits<float>::quiet_NaN()))
+ && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::quiet_NaN()) & HALF_QUIET_BIT) > 0)
+ && ((numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::quiet_NaN())) & HALF_QUIET_BIT) > 0) );
+ // After a cast to half, a signaling NaN may become non-signaling
+ // (e.g. in the case of casting float to native __fp16). Thus, we check that
+ // both are NaN, and that only the `numeric_limits` version is signaling.
+ VERIFY(
+ (numext::isnan)(std::numeric_limits<half>::signaling_NaN())
+ && (numext::isnan)(half(std::numeric_limits<float>::signaling_NaN()))
+ && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()) & HALF_QUIET_BIT) == 0) );
VERIFY( (std::numeric_limits<half>::min)() > half(0.f) );
VERIFY( (std::numeric_limits<half>::denorm_min)() > half(0.f) );
diff --git a/test/main.h b/test/main.h
index e830d68..4f9887c 100644
--- a/test/main.h
+++ b/test/main.h
@@ -47,6 +47,7 @@
#include <list>
#if __cplusplus >= 201103L
#include <random>
+#include <chrono>
#ifdef EIGEN_USE_THREADS
#include <future>
#endif
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index afe36ea..f19d725 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -473,8 +473,6 @@
CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
}
- CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
-
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
}
@@ -486,6 +484,18 @@
packetmath_boolean_mask_ops<Scalar, Packet>();
packetmath_pcast_ops_runner<Scalar, Packet>::run();
packetmath_minus_zero_add<Scalar, Packet>();
+
+ for (int i = 0; i < size; ++i) {
+ data1[i] = numext::abs(internal::random<Scalar>());
+ }
+ CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
+}
+
+// Notice that this definition works for complex types as well.
+// c++11 has std::log2 for real, but not for complex types.
+template <typename Scalar>
+Scalar log2(Scalar x) {  // Scalar reference implementation of the base-2 logarithm for this test file.
+ return Scalar(EIGEN_LOG2E) * std::log(x);  // change of base through std::log; assumes EIGEN_LOG2E is log2(e) -- TODO confirm
}
template <typename Scalar, typename Packet>
@@ -506,6 +516,7 @@
if (internal::random<float>(0, 1) < 0.1f) data1[internal::random<int>(0, PacketSize)] = Scalar(0);
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
+ CHECK_CWISE1_IF(PacketTraits::HasLog, log2, internal::plog2);
CHECK_CWISE1_IF(PacketTraits::HasRsqrt, 1 / std::sqrt, internal::prsqrt);
for (int i = 0; i < size; ++i) {
@@ -556,7 +567,7 @@
VERIFY((numext::isnan)(data2[0]));
// TODO(rmlarsen): Re-enable for bfloat16.
if (!internal::is_same<Scalar, bfloat16>::value) {
- VERIFY_IS_EQUAL(std::exp(small), data2[1]);
+ VERIFY_IS_APPROX(std::exp(small), data2[1]);
}
data1[0] = -small;
@@ -564,21 +575,21 @@
h.store(data2, internal::pexp(h.load(data1)));
// TODO(rmlarsen): Re-enable for bfloat16.
if (!internal::is_same<Scalar, bfloat16>::value) {
- VERIFY_IS_EQUAL(std::exp(-small), data2[0]);
+ VERIFY_IS_APPROX(std::exp(-small), data2[0]);
}
VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]);
data1[0] = (std::numeric_limits<Scalar>::min)();
data1[1] = -(std::numeric_limits<Scalar>::min)();
h.store(data2, internal::pexp(h.load(data1)));
- VERIFY_IS_EQUAL(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
- VERIFY_IS_EQUAL(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
+ VERIFY_IS_APPROX(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
+ VERIFY_IS_APPROX(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
data1[0] = std::numeric_limits<Scalar>::denorm_min();
data1[1] = -std::numeric_limits<Scalar>::denorm_min();
h.store(data2, internal::pexp(h.load(data1)));
- VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
- VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
+ VERIFY_IS_APPROX(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+ VERIFY_IS_APPROX(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
}
if (PacketTraits::HasTanh) {
@@ -618,7 +629,10 @@
test::packet_helper<PacketTraits::HasLog, Packet> h;
h.store(data2, internal::plog(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
- VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+ // TODO(cantonios): Re-enable for bfloat16.
+ if (!internal::is_same<Scalar, bfloat16>::value) {
+ VERIFY_IS_APPROX(std::log(data1[1]), data2[1]);
+ }
data1[0] = -std::numeric_limits<Scalar>::epsilon();
data1[1] = Scalar(0);
@@ -629,7 +643,10 @@
data1[0] = (std::numeric_limits<Scalar>::min)();
data1[1] = -(std::numeric_limits<Scalar>::min)();
h.store(data2, internal::plog(h.load(data1)));
- VERIFY_IS_EQUAL(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+ // TODO(cantonios): Re-enable for bfloat16.
+ if (!internal::is_same<Scalar, bfloat16>::value) {
+ VERIFY_IS_APPROX(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+ }
VERIFY((numext::isnan)(data2[1]));
// Note: 32-bit arm always flushes denorms to zero.
@@ -672,8 +689,10 @@
VERIFY((numext::isnan)(data2[0]));
VERIFY((numext::isnan)(data2[1]));
}
- // TODO(rmlarsen): Re-enable for bfloat16.
- if (PacketTraits::HasCos && !internal::is_same<Scalar, bfloat16>::value) {
+ // TODO(rmlarsen): Re-enable for half and bfloat16.
+ if (PacketTraits::HasCos
+ && !internal::is_same<Scalar, half>::value
+ && !internal::is_same<Scalar, bfloat16>::value) {
test::packet_helper<PacketTraits::HasCos, Packet> h;
for (Scalar k = Scalar(1); k < Scalar(10000) / std::numeric_limits<Scalar>::epsilon(); k *= Scalar(2)) {
for (int k1 = 0; k1 <= 1; ++k1) {
@@ -729,54 +748,6 @@
VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
}
-template <>
-void packetmath_real<bfloat16, typename internal::packet_traits<bfloat16>::type>(){
- typedef internal::packet_traits<bfloat16> PacketTraits;
- typedef internal::packet_traits<bfloat16>::type Packet;
-
- const int PacketSize = internal::unpacket_traits<Packet>::size;
- const int size = PacketSize * 4;
- EIGEN_ALIGN_MAX bfloat16 data1[PacketSize * 4];
- EIGEN_ALIGN_MAX bfloat16 data2[PacketSize * 4];
- EIGEN_ALIGN_MAX bfloat16 ref[PacketSize * 4];
-
- for (int i = 0; i < size; ++i) {
- data1[i] = bfloat16(internal::random<float>(0, 1) * std::pow(float(10), internal::random<float>(-6, 6)));
- data2[i] = bfloat16(internal::random<float>(0, 1) * std::pow(float(10), internal::random<float>(-6, 6)));
- data1[i] = bfloat16(0);
- }
-
- if (internal::random<float>(0, 1) < 0.1f) data1[internal::random<int>(0, PacketSize)] = bfloat16(0);
-
- CAST_CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog, bfloat16, float);
- CAST_CHECK_CWISE1_IF(PacketTraits::HasRsqrt, float(1) / std::sqrt, internal::prsqrt, bfloat16, float);
-
- for (int i = 0; i < size; ++i) {
- data1[i] = bfloat16(internal::random<float>(-1, 1) * std::pow(float(10), internal::random<float>(-3, 3)));
- data2[i] = bfloat16(internal::random<float>(-1, 1) * std::pow(float(10), internal::random<float>(-3, 3)));
- }
- CAST_CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin, bfloat16, float);
- CAST_CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos, bfloat16, float);
- CAST_CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan, bfloat16, float);
-
- CAST_CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround, bfloat16, float);
- CAST_CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil, bfloat16, float);
- CAST_CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor, bfloat16, float);
-
- for (int i = 0; i < size; ++i) {
- data1[i] = bfloat16(-1.5 + i);
- data2[i] = bfloat16(-1.5 + i);
- }
- CAST_CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround, bfloat16, float);
-
- for (int i = 0; i < size; ++i) {
- data1[i] = bfloat16(internal::random<float>(-87, 88));
- data2[i] = bfloat16(internal::random<float>(-87, 88));
- }
- CAST_CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp, bfloat16, float);
-
-}
-
template <typename Scalar>
Scalar propagate_nan_max(const Scalar& a, const Scalar& b) {
if ((numext::isnan)(a)) return a;
@@ -791,6 +762,20 @@
return (numext::mini)(a,b);
}
+template <typename Scalar>
+Scalar propagate_number_max(const Scalar& a, const Scalar& b) {  // Maximum that prefers numbers over NaN: a NaN operand yields the other operand.
+ if ((numext::isnan)(a)) return b;  // a is NaN: result is b (still NaN only when b is NaN as well)
+ if ((numext::isnan)(b)) return a;  // only b is NaN: result is a
+ return (numext::maxi)(a,b);  // neither operand is NaN: ordinary maximum
+}
+
+template <typename Scalar>
+Scalar propagate_number_min(const Scalar& a, const Scalar& b) {  // Minimum that prefers numbers over NaN: a NaN operand yields the other operand.
+ if ((numext::isnan)(a)) return b;  // a is NaN: result is b (still NaN only when b is NaN as well)
+ if ((numext::isnan)(b)) return a;  // only b is NaN: result is a
+ return (numext::mini)(a,b);  // neither operand is NaN: ordinary minimum
+}
+
template <typename Scalar, typename Packet>
void packetmath_notcomplex() {
typedef internal::packet_traits<Scalar> PacketTraits;
@@ -807,15 +792,9 @@
CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin);
CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax);
-#if EIGEN_HAS_CXX11_MATH
- using std::fmin;
- using std::fmax;
-#else
- using ::fmin;
- using ::fmax;
-#endif
- CHECK_CWISE2_IF(PacketTraits::HasMin, fmin, (internal::pmin<PropagateNumbers>));
- CHECK_CWISE2_IF(PacketTraits::HasMax, fmax, internal::pmax<PropagateNumbers>);
+
+ CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, internal::pmin<PropagateNumbers>);
+ CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
CHECK_CWISE1(numext::abs, internal::pabs);
CHECK_CWISE2_IF(PacketTraits::HasAbsDiff, REF_ABS_DIFF, internal::pabsdiff);
@@ -888,54 +867,13 @@
data1[i + PacketSize] = internal::random<bool>() ? std::numeric_limits<Scalar>::quiet_NaN() : Scalar(0);
}
// Note: NaN propagation is implementation defined for pmin/pmax, so we do not test it here.
- CHECK_CWISE2_IF(PacketTraits::HasMin, fmin, (internal::pmin<PropagateNumbers>));
- CHECK_CWISE2_IF(PacketTraits::HasMax, fmax, internal::pmax<PropagateNumbers>);
+ CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, (internal::pmin<PropagateNumbers>));
+ CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin<PropagateNaN>));
CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax<PropagateNaN>);
}
}
-template <>
-void packetmath_notcomplex<bfloat16, typename internal::packet_traits<bfloat16>::type>(){
- typedef bfloat16 Scalar;
- typedef internal::packet_traits<bfloat16>::type Packet;
- typedef internal::packet_traits<Scalar> PacketTraits;
- const int PacketSize = internal::unpacket_traits<Packet>::size;
-
- EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
- EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
- EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
- Array<Scalar, Dynamic, 1>::Map(data1, PacketSize * 4).setRandom();
-
- ref[0] = data1[0];
- for (int i = 0; i < PacketSize; ++i) ref[0] = (std::min)(ref[0], data1[i]);
- VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload<Packet>(data1))) && "internal::predux_min");
-
- VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin);
- VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax);
-
- CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin);
- CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax);
- CHECK_CWISE1(numext::abs, internal::pabs);
- CHECK_CWISE2_IF(PacketTraits::HasAbsDiff, REF_ABS_DIFF, internal::pabsdiff);
-
- ref[0] = data1[0];
- for (int i = 0; i < PacketSize; ++i) ref[0] = (std::max)(ref[0], data1[i]);
- VERIFY(internal::isApprox(ref[0], internal::predux_max(internal::pload<Packet>(data1))) && "internal::predux_max");
-
- {
- unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
- // predux_any
- for (unsigned int i = 0; i < PacketSize * sizeof(Scalar); ++i) data1_bits[i] = 0x0;
- VERIFY((!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
- for (int k = 0; k < PacketSize; ++k) {
- for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0xff;
- VERIFY(internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
- for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0x00;
- }
- }
-}
-
template <typename Scalar, typename Packet, bool ConjLhs, bool ConjRhs>
void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) {
const int PacketSize = internal::unpacket_traits<Packet>::size;
@@ -964,6 +902,8 @@
template <typename Scalar, typename Packet>
void packetmath_complex() {
+ typedef internal::packet_traits<Scalar> PacketTraits;
+ typedef typename Scalar::value_type RealScalar;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = PacketSize * 4;
@@ -982,11 +922,55 @@
test_conj_helper<Scalar, Packet, true, false>(data1, data2, ref, pval);
test_conj_helper<Scalar, Packet, true, true>(data1, data2, ref, pval);
+ // Test pcplxflip.
{
for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i]));
internal::pstore(pval, internal::pcplxflip(internal::pload<Packet>(data1)));
VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip");
}
+
+ if (PacketTraits::HasSqrt) {
+ for (int i = 0; i < size; ++i) {
+ data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
+ }
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+
+ // Test misc. corner cases.
+ const RealScalar zero = RealScalar(0);
+ const RealScalar one = RealScalar(1);
+ const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
+ const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN();
+ data1[0] = Scalar(zero, zero);
+ data1[1] = Scalar(-zero, zero);
+ data1[2] = Scalar(one, zero);
+ data1[3] = Scalar(zero, one);
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+ data1[0] = Scalar(-one, zero);
+ data1[1] = Scalar(zero, -one);
+ data1[2] = Scalar(one, one);
+ data1[3] = Scalar(-one, -one);
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+ data1[0] = Scalar(inf, zero);
+ data1[1] = Scalar(zero, inf);
+ data1[2] = Scalar(-inf, zero);
+ data1[3] = Scalar(zero, -inf);
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+ data1[0] = Scalar(inf, inf);
+ data1[1] = Scalar(-inf, inf);
+ data1[2] = Scalar(inf, -inf);
+ data1[3] = Scalar(-inf, -inf);
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+ data1[0] = Scalar(nan, zero);
+ data1[1] = Scalar(zero, nan);
+ data1[2] = Scalar(nan, one);
+ data1[3] = Scalar(one, nan);
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+ data1[0] = Scalar(nan, nan);
+ data1[1] = Scalar(inf, nan);
+ data1[2] = Scalar(nan, inf);
+ data1[3] = Scalar(-inf, nan);
+ CHECK_CWISE1(numext::sqrt, internal::psqrt);
+ }
}
template <typename Scalar, typename Packet>
@@ -1001,8 +985,9 @@
int stride = internal::random<int>(1, 20);
- EIGEN_ALIGN_MAX Scalar buffer[PacketSize * 20];
- memset(buffer, 0, 20 * PacketSize * sizeof(Scalar));
+ // Buffer of zeros.
+ EIGEN_ALIGN_MAX Scalar buffer[PacketSize * 20] = {};
+
Packet packet = internal::pload<Packet>(data1);
internal::pscatter<Scalar, Packet>(buffer, packet, stride);
@@ -1073,12 +1058,7 @@
CALL_SUBTEST_10(test::runner<uint64_t>::run());
CALL_SUBTEST_11(test::runner<std::complex<float> >::run());
CALL_SUBTEST_12(test::runner<std::complex<double> >::run());
-#if defined(EIGEN_VECTORIZE_AVX)
- // AVX half packets not fully implemented.
- CALL_SUBTEST_13((packetmath<half, internal::packet_traits<half>::type>()));
-#else
CALL_SUBTEST_13(test::runner<half>::run());
-#endif
CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
CALL_SUBTEST_15(test::runner<bfloat16>::run());
g_first_pass = false;
diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt
index 9a56661..34408c0 100644
--- a/unsupported/CMakeLists.txt
+++ b/unsupported/CMakeLists.txt
@@ -1,5 +1,7 @@
add_subdirectory(Eigen)
-add_subdirectory(doc EXCLUDE_FROM_ALL)
+if(EIGEN_BUILD_DOC)
+ add_subdirectory(doc EXCLUDE_FROM_ALL)
+endif()
if(BUILD_TESTING)
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 2640f95..beed230 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -33,10 +33,13 @@
* Much of the documentation can be found \ref eigen_tensors "here".
*/
+#include <atomic>
+#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <random>
+#include <thread>
#ifdef _WIN32
typedef __int16 int16_t;
@@ -48,7 +51,6 @@
#include <windows.h>
#else
#include <stdint.h>
-#include <unistd.h>
#endif
#ifdef _WIN32
@@ -70,8 +72,6 @@
#else
#include <cuda_runtime.h>
#endif
- #include <atomic>
- #include <unistd.h>
#endif
#include "src/Tensor/TensorMacros.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index 7f33944..9422dcd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -83,7 +83,7 @@
// Wait for the other thread to inititialize the properties.
while (!m_devicePropInitialized) {
std::atomic_thread_fence(std::memory_order_acquire);
- EIGEN_SLEEP(1000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
}
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index 76d15f1..73ff3d2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -43,15 +43,6 @@
#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
typename internal::enable_if< ( __condition__ ) , int >::type = 0
-
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#define EIGEN_SLEEP(n) Sleep(n)
-#elif EIGEN_OS_GNULINUX
-#define EIGEN_SLEEP(n) usleep(n * 1000);
-#else
-#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
-#endif
-
// Define a macro to use a reference on the host but a value on the device
#if defined(SYCL_DEVICE_ONLY)
#define EIGEN_DEVICE_REF
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
index dda6618..f6a2460 100644
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions
@@ -61,23 +61,36 @@
}
#include "src/SpecialFunctions/BesselFunctionsImpl.h"
-#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
#include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
#include "src/SpecialFunctions/BesselFunctionsHalf.h"
+#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
#if defined(EIGEN_HIPCC)
#include "src/SpecialFunctions/HipVectorCompatibility.h"
#endif
-#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h"
#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+#if defined EIGEN_VECTORIZE_AVX512
+ #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+ #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_NEON
+ #include "src/SpecialFunctions/arch/NEON/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h"
+#endif
+
#if defined EIGEN_VECTORIZE_GPU
- #include "src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h"
+ #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
#endif
namespace Eigen {
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
index a9b6ad9..24812be 100644
--- a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
@@ -46,7 +46,7 @@
typedef Scalar type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_i0e {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -201,11 +201,11 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_i0e_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_i0e<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i0e<T>::run(x);
}
};
@@ -214,7 +214,7 @@
typedef Scalar type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_i0 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T& x) {
@@ -224,11 +224,11 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_i0_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_i0<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i0<T>::run(x);
}
};
@@ -237,7 +237,7 @@
typedef Scalar type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type >
struct generic_i1e {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -396,20 +396,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_i1e_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_i1e<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i1e<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_i1_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_i1 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T& x) {
@@ -419,20 +419,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_i1_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_i1<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i1<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k0e_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_k0e {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -582,20 +582,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k0e_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_k0e<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k0e<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k0_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_k0 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -754,20 +754,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k0_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_k0<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k0<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k1e_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_k1e {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -910,20 +910,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k1e_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_k1e<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k1e<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k1_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_k1 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -1076,20 +1076,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_k1_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_k1<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k1<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_j0_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_j0 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -1276,20 +1276,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_j0_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_j0<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_j0<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_y0_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_y0 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -1474,20 +1474,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_y0_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_y0<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_y0<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_j1_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_j1 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -1665,20 +1665,20 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_j1_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_j1<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_j1<T>::run(x);
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_y1_retval {
- typedef Scalar type;
+ typedef T type;
};
-template <typename T, typename ScalarType>
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
struct generic_y1 {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE T run(const T&) {
@@ -1868,11 +1868,11 @@
}
};
-template <typename Scalar>
+template <typename T>
struct bessel_y1_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
- return generic_y1<Scalar, Scalar>::run(x);
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_y1<T>::run(x);
}
};
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
index efc6d9c..943d10f 100644
--- a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
@@ -19,8 +19,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_i0(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_i0; return generic_i0<Packet, ScalarType>::run(x);
+ return numext::bessel_i0(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -28,8 +27,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_i0e(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_i0e; return generic_i0e<Packet, ScalarType>::run(x);
+ return numext::bessel_i0e(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -37,8 +35,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_i1(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_i1; return generic_i1<Packet, ScalarType>::run(x);
+ return numext::bessel_i1(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -46,8 +43,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_i1e(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_i1e; return generic_i1e<Packet, ScalarType>::run(x);
+ return numext::bessel_i1e(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -55,8 +51,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_j0(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_j0; return generic_j0<Packet, ScalarType>::run(x);
+ return numext::bessel_j0(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -64,8 +59,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_j1(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_j1; return generic_j1<Packet, ScalarType>::run(x);
+ return numext::bessel_j1(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -73,8 +67,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_y0(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_y0; return generic_y0<Packet, ScalarType>::run(x);
+ return numext::bessel_y0(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -82,8 +75,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_y1(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_y1; return generic_y1<Packet, ScalarType>::run(x);
+ return numext::bessel_y1(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -91,8 +83,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_k0(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_k0; return generic_k0<Packet, ScalarType>::run(x);
+ return numext::bessel_k0(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -100,8 +91,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_k0e(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_k0e; return generic_k0e<Packet, ScalarType>::run(x);
+ return numext::bessel_k0e(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -109,8 +99,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_k1(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_k1; return generic_k1<Packet, ScalarType>::run(x);
+ return numext::bessel_k1(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
@@ -118,8 +107,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pbessel_k1e(const Packet& x) {
- typedef typename unpacket_traits<Packet>::type ScalarType;
- using internal::generic_k1e; return generic_k1e<Packet, ScalarType>::run(x);
+ return numext::bessel_k1e(x);
}
} // end namespace internal
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 648eb05..cfc13af 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -348,7 +348,7 @@
template <typename T>
struct erf_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE T run(const T x) {
+ static EIGEN_STRONG_INLINE T run(const T& x) {
return generic_fast_erf_float(x);
}
};
@@ -490,7 +490,8 @@
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
const T& should_flipsign, const T& x) {
- const T sign_mask = pset1<T>(-0.0);
+ typedef typename unpacket_traits<T>::type Scalar;
+ const T sign_mask = pset1<T>(Scalar(-0.0));
T sign_bit = pand<T>(should_flipsign, sign_mask);
return pxor<T>(sign_bit, x);
}
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
new file mode 100644
index 0000000..2d76692
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX_BESSELFUNCTIONS_H
+#define EIGEN_AVX_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX_BESSELFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
new file mode 100644
index 0000000..35e62a8
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H
+#define EIGEN_AVX_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, perf)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
new file mode 100644
index 0000000..7dd3c3e
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H
+#define EIGEN_AVX512_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX512_BESSELFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
new file mode 100644
index 0000000..79878f2
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H
+#define EIGEN_AVX512_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, perf)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX512_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
similarity index 100%
rename from unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
rename to unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
new file mode 100644
index 0000000..67433b0
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
@@ -0,0 +1,54 @@
+#ifndef EIGEN_NEON_BESSELFUNCTIONS_H
+#define EIGEN_NEON_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD) \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) { \
+ const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x))); \
+ const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+ return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); \
+} \
+ \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) { \
+ return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x))); \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_NEON_BESSELFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
new file mode 100644
index 0000000..ec92951
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
@@ -0,0 +1,34 @@
+#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H
+#define EIGEN_NEON_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD) \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) { \
+ const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x))); \
+ const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+ return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); \
+} \
+ \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) { \
+ return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x))); \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(perf)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_NEON_SPECIALFUNCTIONS_H
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 90b330f..993ee17 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -109,7 +109,9 @@
// Schedule a large number of closure that each sleeps for one second. This
// will keep the thread pool busy for much longer than the default test timeout.
for (int i = 0; i < 1000; ++i) {
- tp.Schedule([]() { EIGEN_SLEEP(2000); });
+ tp.Schedule([]() {
+ std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+ });
}
// Cancel the processing of all the closures that are still pending.
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
index a7a49fa..b2e26eb 100644
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -222,7 +222,7 @@
input.setRandom();
VerifyBlockEvaluator<T, NumDims, Layout>(
- input.square(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+ input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
}
template <typename T, int NumDims, int Layout>
@@ -274,7 +274,7 @@
// Check that desc.destination() memory is not shared between two broadcast
// materializations.
VerifyBlockEvaluator<T, NumDims, Layout>(
- input.broadcast(bcast) * input.square().broadcast(bcast),
+ input.broadcast(bcast) * input.abs().broadcast(bcast),
[&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
}
@@ -391,27 +391,46 @@
// Block expression assignment.
VerifyBlockEvaluator<T, NumDims - 1, Layout>(
- input.square().chip(chip_offset, chip_dim),
+ input.abs().chip(chip_offset, chip_dim),
[&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
VerifyBlockEvaluator<T, NumDims - 1, Layout>(
- input.square().chip(chip_offset, chip_dim),
+ input.abs().chip(chip_offset, chip_dim),
[&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
}
+
+template<typename T, int NumDims>
+struct SimpleTensorGenerator {
+ T operator()(const array<Index, NumDims>& coords) const {
+ T result = static_cast<T>(0);
+ for (int i = 0; i < NumDims; ++i) {
+ result += static_cast<T>((i + 1) * coords[i]);
+ }
+ return result;
+ }
+};
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<int NumDims>
+struct SimpleTensorGenerator<bool, NumDims> {
+ bool operator()(const array<Index, NumDims>& coords) const {
+ bool result = false;
+ for (int i = 0; i < NumDims; ++i) {
+ result ^= coords[i];
+ }
+ return result;
+ }
+};
+
+
template <typename T, int NumDims, int Layout>
static void test_eval_tensor_generator() {
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
Tensor<T, NumDims, Layout> input(dims);
input.setRandom();
- auto generator = [](const array<Index, NumDims>& coords) -> T {
- T result = static_cast<T>(0);
- for (int i = 0; i < NumDims; ++i) {
- result += static_cast<T>((i + 1) * coords[i]);
- }
- return result;
- };
+ auto generator = SimpleTensorGenerator<T, NumDims>();
VerifyBlockEvaluator<T, NumDims, Layout>(
input.generate(generator), [&dims]() { return FixedSizeBlock(dims); });
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 2bade8d..8e81653 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -23,7 +23,7 @@
Eigen::Notification n;
auto func = [&n, &counter](){ n.Wait(); ++counter;};
thread_pool.Schedule(func);
- EIGEN_SLEEP(1000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// The thread should be waiting for the notification.
VERIFY_IS_EQUAL(counter, 0);
@@ -31,7 +31,7 @@
// Unblock the thread
n.Notify();
- EIGEN_SLEEP(1000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// Verify the counter has been incremented
VERIFY_IS_EQUAL(counter, 1);
@@ -50,10 +50,10 @@
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
- EIGEN_SLEEP(1000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
VERIFY_IS_EQUAL(counter, 0);
n.Notify();
- EIGEN_SLEEP(1000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
VERIFY_IS_EQUAL(counter, 4);
}
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
index 1027272..56848da 100644
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp
@@ -11,6 +11,17 @@
#include "main.h"
#include "../Eigen/SpecialFunctions"
+// Hack to allow "implicit" conversions from double to Scalar via comma-initialization.
+template<typename Derived>
+Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) {
+ return (dense << static_cast<typename Derived::Scalar>(v));
+}
+
+template<typename XprType>
+Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) {
+ return (ci, static_cast<typename XprType::Scalar>(v));
+}
+
template<typename X, typename Y>
void verify_component_wise(const X& x, const Y& y)
{
@@ -65,8 +76,8 @@
// igamma(a, x) = gamma(a, x) / Gamma(a)
// where Gamma and gamma are considered the standard unnormalized
// upper and lower incomplete gamma functions, respectively.
- ArrayType a = m1.abs() + 2;
- ArrayType x = m2.abs() + 2;
+ ArrayType a = m1.abs() + Scalar(2);
+ ArrayType x = m2.abs() + Scalar(2);
ArrayType zero = ArrayType::Zero(rows, cols);
ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
ArrayType a_m1 = a - one;
@@ -83,18 +94,18 @@
VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
// Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
- VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+ VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp());
// gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
- VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+ VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp());
}
{
// Verify for large a and x that values are between 0 and 1.
ArrayType m1 = ArrayType::Random(rows,cols);
ArrayType m2 = ArrayType::Random(rows,cols);
- Scalar max_exponent = std::numeric_limits<Scalar>::max_exponent10;
- ArrayType a = m1.abs() * pow(10., max_exponent - 1);
- ArrayType x = m2.abs() * pow(10., max_exponent - 1);
+ int max_exponent = std::numeric_limits<Scalar>::max_exponent10;
+ ArrayType a = m1.abs() * Scalar(pow(10., max_exponent - 1));
+ ArrayType x = m2.abs() * Scalar(pow(10., max_exponent - 1));
for (int i = 0; i < a.size(); ++i) {
Scalar igam = numext::igamma(a(i), x(i));
VERIFY(0 <= igam);
@@ -108,27 +119,37 @@
Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
// location i*6+j corresponds to a_s[i], x_s[j].
- Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
- {0.0, 0.6321205588285578, 0.7768698398515702,
- 0.9816843611112658, 9.999500016666262e-05, 1.0},
- {0.0, 0.4275932955291202, 0.608374823728911,
- 0.9539882943107686, 7.522076445089201e-07, 1.0},
- {0.0, 0.01898815687615381, 0.06564245437845008,
- 0.5665298796332909, 4.166333347221828e-18, 1.0},
- {0.0, 0.9999780593618628, 0.9999899967080838,
- 0.9999996219837988, 0.9991370418689945, 1.0},
- {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
- Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
- {1.0, 0.36787944117144233, 0.22313016014842982,
- 0.018315638888734182, 0.9999000049998333, 0.0},
- {1.0, 0.5724067044708798, 0.3916251762710878,
- 0.04601170568923136, 0.9999992477923555, 0.0},
- {1.0, 0.9810118431238462, 0.9343575456215499,
- 0.4334701203667089, 1.0, 0.0},
- {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
- 3.7801620118431334e-07, 0.0008629581310054535,
- 0.0},
- {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+ Scalar igamma_s[][6] = {
+ {Scalar(0.0), nan, nan, nan, nan, nan},
+ {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702),
+ Scalar(0.9816843611112658), Scalar(9.999500016666262e-05),
+ Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911),
+ Scalar(0.9539882943107686), Scalar(7.522076445089201e-07),
+ Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.01898815687615381),
+ Scalar(0.06564245437845008), Scalar(0.5665298796332909),
+ Scalar(4.166333347221828e-18), Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838),
+ Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0),
+ Scalar(0.5042041932513908)}};
+ Scalar igammac_s[][6] = {
+ {nan, nan, nan, nan, nan, nan},
+ {Scalar(1.0), Scalar(0.36787944117144233),
+ Scalar(0.22313016014842982), Scalar(0.018315638888734182),
+ Scalar(0.9999000049998333), Scalar(0.0)},
+ {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878),
+ Scalar(0.04601170568923136), Scalar(0.9999992477923555),
+ Scalar(0.0)},
+ {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499),
+ Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)},
+ {Scalar(1.0), Scalar(2.1940638138146658e-05),
+ Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07),
+ Scalar(0.0008629581310054535), Scalar(0.0)},
+ {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0),
+ Scalar(0.49579580674813944)}};
+
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(igamma_s[i][j])) {
@@ -162,8 +183,8 @@
ArrayType m1 = ArrayType::Random(32);
using std::sqrt;
- ArrayType cdf_val = (m1 / sqrt(2.)).erf();
- cdf_val = (cdf_val + 1.) / 2.;
+ ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf();
+ cdf_val = (cdf_val + Scalar(1)) / Scalar(2);
verify_component_wise(cdf_val.ndtri(), m1););
}
@@ -190,7 +211,6 @@
CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); );
}
-
#if EIGEN_HAS_C99_MATH
{
ArrayType n(11), x(11), res(11), ref(11);
@@ -323,8 +343,8 @@
ArrayType m3 = ArrayType::Random(32);
ArrayType one = ArrayType::Constant(32, Scalar(1.0));
const Scalar eps = std::numeric_limits<Scalar>::epsilon();
- ArrayType a = (m1 * 4.0).exp();
- ArrayType b = (m2 * 4.0).exp();
+ ArrayType a = (m1 * Scalar(4)).exp();
+ ArrayType b = (m2 * Scalar(4)).exp();
ArrayType x = m3.abs();
// betainc(a, 1, x) == x**a
@@ -471,4 +491,7 @@
{
CALL_SUBTEST_1(array_special_functions<ArrayXf>());
CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+ // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above.
+ // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>());
+ // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>());
}
diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp
index 87b8735..4f426eb 100644
--- a/unsupported/test/special_packetmath.cpp
+++ b/unsupported/test/special_packetmath.cpp
@@ -8,6 +8,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <limits>
#include "packetmath_test_shared.h"
#include "../Eigen/SpecialFunctions"
@@ -43,42 +44,48 @@
}
{
for (int i=0; i<size; ++i) {
- data1[i] = internal::random<Scalar>(0,1);
+ data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1));
}
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
}
#endif // EIGEN_HAS_C99_MATH
// For bessel_i*e and bessel_j*, the valid range is negative reals.
- for (int i=0; i<size; ++i)
{
- data1[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
- data2[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
- }
+ const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6);
+ for (int i=0; i<size; ++i)
+ {
+ data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+ data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+ }
- CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
- CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e);
- CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
- CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
+ }
// Use a smaller data range for the bessel_i* as these can become very large.
// Following #1693, we also restrict this range further to avoid inf's due to
// differences in pexp and exp.
for (int i=0; i<size; ++i) {
- data1[i] = internal::random<Scalar>(0.01,1) * std::pow(
- Scalar(9), internal::random<Scalar>(-1,2));
- data2[i] = internal::random<Scalar>(0.01,1) * std::pow(
- Scalar(9), internal::random<Scalar>(-1,2));
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
}
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0);
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1);
// y_i, and k_i are valid for x > 0.
- for (int i=0; i<size; ++i)
{
- data1[i] = internal::random<Scalar>(0.01,1) * std::pow(Scalar(10), internal::random<Scalar>(-2,5));
- data2[i] = internal::random<Scalar>(0.01,1) * std::pow(Scalar(10), internal::random<Scalar>(-2,5));
+ const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5);
+ for (int i=0; i<size; ++i)
+ {
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+ }
}
// TODO(srvasude): Re-enable this test once properly investigated why the
@@ -91,20 +98,20 @@
// Following #1693, we restrict the range for exp to avoid zeroing out too
// fast.
for (int i=0; i<size; ++i) {
- data1[i] = internal::random<Scalar>(0.01,1) * std::pow(
- Scalar(9), internal::random<Scalar>(-1,2));
- data2[i] = internal::random<Scalar>(0.01,1) * std::pow(
- Scalar(9), internal::random<Scalar>(-1,2));
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
}
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0);
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1);
for (int i=0; i<size; ++i) {
- data1[i] = internal::random<Scalar>(0.01,1) * std::pow(
- Scalar(10), internal::random<Scalar>(-1,2));
- data2[i] = internal::random<Scalar>(0.01,1) * std::pow(
- Scalar(10), internal::random<Scalar>(-1,2));
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
}
#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
@@ -135,6 +142,8 @@
CALL_SUBTEST_1( test::runner<float>::run() );
CALL_SUBTEST_2( test::runner<double>::run() );
+ CALL_SUBTEST_3( test::runner<Eigen::half>::run() );
+ CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() );
g_first_pass = false;
}
}