Update Eigen to commit: d2dce377670f1742f3f701b3585682d02f10bb92
CHANGELOG
=========
d2dce3776 - Optimize slerp() as proposed by Gopinath Vasalamarri.
66d8111ac - Use a more conservative method to detect non-finite inputs to cbrt.
d6689a15d - Replace instances of EIGEN_CONSTEXPR macro
33f5f5961 - Vectorize cbrt for float and double.
533096090 - Enable packet segment in partial redux
6266d430c - packet segment: also check DiagonalWrapper
e39ad8bad - fix constexpr in CoreEvaluators.h
7aefb9f4d - fix memset optimization for std::complex types
73ca849a6 - fix packetSegment for ArrayWrapper / MatrixWrapper
28c3b26d5 - masked load/store framework
cebe09110 - Fix a potential deadlock because of Eigen thread pool
11fd34cc1 - Fix the typing of the Tasks in ForkJoin.h
2cd47d743 - Fix Conversion Warning in Parallelizer
b86004226 - Add postream for ostream-ing packets more reliably.
02d9e1138 - Add missing pmadd for Packet16bf.
9cc9209b9 - Fix cmake warning and default to j0.
e0c99a8dd - By default, run ctests on all available cores in parallel.
63a40ffb9 - Use fma<float> for fma<half> and fma<bfloat16> if native fma is not available on the platform.
44fb6422b - Allow triggering full CI if MR label contains all-tests
3866cbfbe - Fix test for TensorRef of trace.
6579e36eb - Allow Tensor trace to be passed to a TensorRef.
8e32cbf7d - Reduce flakiness of test for Eigen::half.
d935916ac - Add numext::fma and missing pmadd implementations.
754bd24f5 - fix 2828
ac2165c11 - fix allFinite
314396819 - Generalize the Eigen ForkJoin scheduler to use any ThreadPool interface.
70f2aead9 - Use native _Float16 for AVX512FP16 and update vectorization.
0259a52b0 - Use more .noalias()
14f845a1a - Fix givens rotation.
33b04fe51 - CMake: add install-doc target
10e62ccd2 - Fix x86 complex vectorized fma
PiperOrigin-RevId: 753703769
Change-Id: I43bc1cf7c598ca3f306fffea9844e9c5b1a21b79
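
The central addition in this update is the packet segment (masked load/store) framework: packetSegment and writePacketSegment read or write only count elements starting at element begin of a packet, so vectorized assignment can handle unaligned heads and partial tails without dropping to scalar loops. As a rough sketch of the semantics only (the library dispatches to genuinely masked loads where the ISA supports them; everything beyond ploadu/unpacket_traits below is illustrative):

    // Generic fallback shape of a segment load (sketch, not the actual kernel):
    // copy the requested elements [begin, begin + count) into a zeroed buffer,
    // then load the full packet from the buffer.
    template <typename Packet>
    Packet ploadSegment_sketch(const typename unpacket_traits<Packet>::type* from,
                               Index begin, Index count) {
      typedef typename unpacket_traits<Packet>::type Scalar;
      constexpr int Size = unpacket_traits<Packet>::size;
      Scalar buffer[Size] = {};
      for (Index k = begin; k < begin + count; ++k) buffer[k] = from[k];
      return ploadu<Packet>(buffer);
    }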
diff --git a/Eigen/Core b/Eigen/Core
index 99cd473..6ae069a 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -193,21 +193,27 @@
#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
#if defined EIGEN_VECTORIZE_AVX512
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX512/PacketMath.h"
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/PacketMathFP16.h"
#endif
-#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/TypeCasting.h"
-#include "src/Core/arch/SSE/Complex.h"
-#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/TypeCasting.h"
-#include "src/Core/arch/AVX/Complex.h"
-#include "src/Core/arch/AVX512/PacketMath.h"
#include "src/Core/arch/AVX512/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/TypeCastingFP16.h"
+#endif
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX512/Complex.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/MathFunctionsFP16.h"
+#endif
#include "src/Core/arch/AVX512/TrsmKernel.h"
#elif defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 5d52ab2..b1d801d 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -230,8 +230,8 @@
*/
const LDLT& adjoint() const { return *this; }
- EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
- EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
/** \brief Reports whether previous computation was successful.
*
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 01b4476..7fa4fa2 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -182,10 +182,10 @@
* This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
* \code x = decomposition.adjoint().solve(b) \endcode
*/
- const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }
+ const LLT& adjoint() const noexcept { return *this; }
- inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
- inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ constexpr Index rows() const noexcept { return m_matrix.rows(); }
+ constexpr Index cols() const noexcept { return m_matrix.cols(); }
template <typename VectorType>
LLT& rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 3376dfc..57f3186 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -129,7 +129,7 @@
#endif
/** \brief Move constructor */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(Array&&) = default;
- EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+ EIGEN_DEVICE_FUNC Array& operator=(Array&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
Base::operator=(std::move(other));
return *this;
}
@@ -253,8 +253,8 @@
PrivateType())
: Base(other.derived()) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
#ifdef EIGEN_ARRAY_PLUGIN
#include EIGEN_ARRAY_PLUGIN
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index b636d88..c9a194e 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -56,14 +56,10 @@
EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
- return m_expression.outerStride();
- }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
- return m_expression.innerStride();
- }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
@@ -135,14 +131,10 @@
EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
- return m_expression.outerStride();
- }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
- return m_expression.innerStride();
- }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 0ea1bc3..36f0a9d 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -136,6 +136,7 @@
: Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
#endif
: NoUnrolling;
+ static constexpr bool UsePacketSegment = has_packet_segment<PacketType>::value;
#ifdef EIGEN_DEBUG_ASSIGN
static void debug() {
@@ -199,7 +200,7 @@
template <typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
};
template <typename Kernel, int Index_, int Stop>
@@ -253,7 +254,7 @@
template <typename Kernel, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
};
template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
@@ -273,6 +274,33 @@
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
};
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment>
+struct copy_using_evaluator_innervec_segment {
+ using PacketType = typename Kernel::PacketType;
+
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+ kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0,
+ Stop - Start);
+ }
+};
+
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment,
+ /*UsePacketSegment*/ false>
+ : copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+ /*UsePacketSegment*/ true> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+ /*UsePacketSegment*/ false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/
@@ -306,7 +334,7 @@
struct dense_assignment_loop_impl<Kernel, AllAtOnceTraversal, Unrolling> {
static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
- EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) {
+ EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& /*kernel*/) {
EIGEN_STATIC_ASSERT(SizeAtCompileTime == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
}
};
@@ -353,28 +381,46 @@
// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
// of the non vectorizable beginning and ending parts
-template <bool IsAligned = false>
+template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip>
struct unaligned_dense_assignment_loop {
- // if IsAligned = true, then do nothing
+ // if Skip == true, then do nothing
template <typename Kernel>
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {}
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*start*/, Index /*end*/) {}
+ template <typename Kernel>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*outer*/,
+ Index /*innerStart*/, Index /*innerEnd*/) {}
};
-template <>
-struct unaligned_dense_assignment_loop<false> {
- // MSVC must not inline this functions. If it does, it fails to optimize the
- // packet access path.
- // FIXME check which version exhibits this issue
-#if EIGEN_COMP_MSVC
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true,
+ /*Skip*/ false> {
template <typename Kernel>
- static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end)
-#else
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {
+ Index count = end - start;
+ eigen_assert(count <= unpacket_traits<PacketType>::size);
+ if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count);
+ }
template <typename Kernel>
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end)
-#endif
- {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index start, Index end) {
+ Index count = end - start;
+ eigen_assert(count <= unpacket_traits<PacketType>::size);
+ if (count > 0)
+ kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count);
+ }
+};
+
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false,
+ /*Skip*/ false> {
+ template <typename Kernel>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {
for (Index index = start; index < end; ++index) kernel.assignCoeff(index);
}
+ template <typename Kernel>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index innerStart,
+ Index innerEnd) {
+ for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+ }
};
template <typename Kernel, int Index_, int Stop>
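
The rewritten unaligned_dense_assignment_loop above swaps the old single bool IsAligned specialization for two trailing bool parameters, UsePacketSegment and Skip, each resolved by partial specialization. The dispatch pattern in miniature (an illustrative sketch, not Eigen code):

    // A trailing bool selects the implementation at compile time; the
    // Skip == true specialization compiles away to nothing.
    template <bool Skip>
    struct maybe_run {
      template <typename F>
      static void run(F&& f) { f(); }
    };
    template <>
    struct maybe_run<true> {
      template <typename F>
      static void run(F&&) {}  // no-op when skipped
    };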
@@ -392,7 +438,32 @@
template <typename Kernel, int Stop>
struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Index_, int Stop, bool UsePacketSegment>
+struct copy_using_evaluator_linearvec_segment {
+ using PacketType = typename Kernel::PacketType;
+ static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+ static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+ kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_);
+ }
+};
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false>
+ : copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
};
template <typename Kernel>
@@ -400,23 +471,30 @@
using Scalar = typename Kernel::Scalar;
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
- static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment;
- static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
- static constexpr int DstAlignment =
- packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
+ static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
+ static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
+ static constexpr bool Alignable =
+ (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+ static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
+ static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+ static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ using head_loop =
+ unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+ using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>;
+
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
const Index size = kernel.size();
- const Index alignedStart = DstIsAligned ? 0 : first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
+ const Index alignedStart = DstIsAligned ? 0 : first_aligned<Alignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
- unaligned_dense_assignment_loop<DstIsAligned>::run(kernel, 0, alignedStart);
+ head_loop::run(kernel, 0, alignedStart);
for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
- kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
+ kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index);
- unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+ tail_loop::run(kernel, alignedEnd, size);
}
};
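
The new Alignable condition is worth a worked instance: promoting the destination to the packet's requested alignment is only possible when the gap to that boundary is a whole number of scalars.

    // Sketch: float data (4 bytes), destination known 8-byte aligned,
    // 32-byte packets. (32 - 8) % 4 == 0, so scalar peeling can reach a
    // 32-byte boundary and Alignment is promoted to 32.
    constexpr int DstAlignment = 8, RequestedAlignment = 32, ScalarSize = 4;
    static_assert((RequestedAlignment - DstAlignment) % ScalarSize == 0,
                  "a 32-byte boundary is reachable in whole floats");

If the remainder were nonzero, no amount of peeling could reach the boundary and Alignment would stay at DstAlignment.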
@@ -426,10 +504,11 @@
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
static constexpr int AlignedSize = numext::round_down(Size, PacketSize);
+ static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);
- copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, AlignedSize, Size>::run(kernel);
+ copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel);
}
};
@@ -444,7 +523,7 @@
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize();
for (Index outer = 0; outer < outerSize; ++outer)
@@ -482,7 +561,7 @@
template <typename Kernel>
struct dense_assignment_loop_impl<Kernel, LinearTraversal, NoUnrolling> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
const Index size = kernel.size();
for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
}
@@ -490,7 +569,7 @@
template <typename Kernel>
struct dense_assignment_loop_impl<Kernel, LinearTraversal, CompleteUnrolling> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run(
kernel);
}
@@ -505,35 +584,35 @@
using Scalar = typename Kernel::Scalar;
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
- static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment;
+ static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+ static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
+ static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
static constexpr bool Alignable =
- packet_traits<Scalar>::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar);
- static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
- static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
+ (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+ static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
+ static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+ static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>;
+ using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>;
+
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
const Scalar* dst_ptr = kernel.dstDataPtr();
- if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) {
- // the pointer is not aligned-on scalar, so alignment is not possible
- return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel);
- }
const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize();
const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;
- Index alignedStart =
- ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<RequestedAlignment>(dst_ptr, innerSize);
+ Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<Alignment>(dst_ptr, innerSize);
for (Index outer = 0; outer < outerSize; ++outer) {
const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);
- // do the non-vectorizable part of the assignment
- for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+
+ head_loop::run(kernel, outer, 0, alignedStart);
// do the vectorizable part of the assignment
for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
- kernel.template assignPacketByOuterInner<DstAlignment, Unaligned, PacketType>(outer, inner);
+ kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner);
- // do the non-vectorizable part of the assignment
- for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+ tail_loop::run(kernel, outer, alignedEnd, innerSize);
alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);
}
@@ -547,11 +626,16 @@
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);
+ static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+ using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>;
+ using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned,
+ Unaligned, UsePacketSegment>;
+
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
- copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, 0, 0>::run(kernel, outer);
- copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizableSize, InnerSize>::run(kernel, outer);
+ packet_loop::run(kernel, outer);
+ packet_segment_loop::run(kernel, outer);
}
}
};
@@ -590,15 +674,15 @@
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_dstExpr.size(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerSize() const noexcept { return m_dstExpr.innerSize(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerSize() const noexcept { return m_dstExpr.outerSize(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dstExpr.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_dstExpr.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_dstExpr.outerStride(); }
- EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
- EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
+ EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() noexcept { return m_dst; }
+ EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; }
/// Assign src(row,col) to dst(row,col) through the assignment functor.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) {
@@ -635,6 +719,27 @@
assignPacket<StoreMode, LoadMode, Packet>(row, col);
}
+ template <int StoreMode, int LoadMode, typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+ m_functor.template assignPacketSegment<StoreMode>(
+ &m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin,
+ count);
+ }
+
+ template <int StoreMode, int LoadMode, typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+ m_functor.template assignPacketSegment<StoreMode>(
+ &m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count);
+ }
+
+ template <int StoreMode, int LoadMode, typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin,
+ Index count) {
+ Index row = rowIndexByOuterInner(outer, inner);
+ Index col = colIndexByOuterInner(outer, inner);
+ assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count);
+ }
+
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
typedef typename DstEvaluatorType::ExpressionTraits Traits;
return int(Traits::RowsAtCompileTime) == 1 ? 0
@@ -704,9 +809,8 @@
}
template <typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
- const SrcXprType& src,
- const Functor& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+ const Functor& func) {
typedef evaluator<DstXprType> DstEvaluatorType;
typedef evaluator<SrcXprType> SrcEvaluatorType;
@@ -775,7 +879,7 @@
// Deal with "assume-aliasing"
template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
Dst& dst, const Src& src, const Func& func, std::enable_if_t<evaluator_assume_aliasing<Src>::value, void*> = 0) {
typename plain_matrix_type<Src>::type tmp(src);
call_assignment_no_alias(dst, tmp, func);
@@ -790,14 +894,14 @@
// by-pass "assume-aliasing"
// When there is no aliasing, we require that 'dst' has been properly resized
template <typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(NoAlias<Dst, StorageBase>& dst,
- const Src& src, const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(NoAlias<Dst, StorageBase>& dst, const Src& src,
+ const Func& func) {
call_assignment_no_alias(dst.expression(), src, func);
}
template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src,
- const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src,
+ const Func& func) {
enum {
NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
(int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
@@ -836,14 +940,13 @@
}
template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src) {
call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
}
template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst,
- const Src& src,
- const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src,
+ const Func& func) {
// TODO check whether this is the right place to perform these checks:
EIGEN_STATIC_ASSERT_LVALUE(Dst)
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src)
@@ -852,8 +955,7 @@
Assignment<Dst, Src, Func>::run(dst, src, func);
}
template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst,
- const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) {
call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
}
diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h
index ca991ca..57b0322 100644
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -200,16 +200,16 @@
: m_coeffs(1 + supers + subs, cols), m_rows(rows), m_supers(supers), m_subs(subs) {}
/** \returns the number of rows */
- inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+ constexpr Index rows() const { return m_rows.value(); }
/** \returns the number of columns */
- inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+ constexpr Index cols() const { return m_coeffs.cols(); }
/** \returns the number of super diagonals */
- inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+ constexpr Index supers() const { return m_supers.value(); }
/** \returns the number of sub diagonals */
- inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+ constexpr Index subs() const { return m_subs.value(); }
inline const CoefficientsType& coeffs() const { return m_coeffs; }
inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -260,16 +260,16 @@
}
/** \returns the number of rows */
- inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+ constexpr Index rows() const { return m_rows.value(); }
/** \returns the number of columns */
- inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+ constexpr Index cols() const { return m_coeffs.cols(); }
/** \returns the number of super diagonals */
- inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+ constexpr Index supers() const { return m_supers.value(); }
/** \returns the number of sub diagonals */
- inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+ constexpr Index subs() const { return m_subs.value(); }
inline const CoefficientsType& coeffs() const { return m_coeffs; }
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 709264c..39abff7 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -289,13 +289,9 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT {
- return m_startRow.value();
- }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT {
- return m_startCol.value();
- }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
protected:
XprTypeNested m_xpr;
@@ -319,8 +315,7 @@
* Adding an offset to nullptr is undefined behavior, so we must avoid it.
*/
template <typename Scalar>
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base,
- Index offset) {
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) {
return base != nullptr ? base + offset : nullptr;
}
@@ -378,30 +373,25 @@
init();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const
- EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const noexcept {
return m_xpr;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
/** \sa MapBase::innerStride() */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index innerStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index innerStride() const noexcept {
return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.innerStride() : m_xpr.outerStride();
}
/** \sa MapBase::outerStride() */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept {
return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT {
- return m_startRow.value();
- }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT {
- return m_startCol.value();
- }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
#ifndef __SUNPRO_CC
// FIXME sunstudio is not friendly with the above friend...
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index c629123..c414117 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -92,7 +92,7 @@
EIGEN_DEVICE_FUNC inline ~CommaInitializer()
#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
- EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+ noexcept(false) // Eigen::eigen_assert_exception
#endif
{
finished();
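
Context for the CommaInitializer change above: since C++11, destructors are implicitly noexcept(true), so a destructor that may throw, as this one can in assertion-testing builds where eigen_assert throws, must opt out explicitly or a throw escaping it calls std::terminate. A minimal standalone illustration (assumed example, not Eigen code):

    #include <stdexcept>

    struct Finisher {
      bool finished = false;
      // Without noexcept(false), the implicit noexcept on the destructor
      // would turn this throw into std::terminate.
      ~Finisher() noexcept(false) {
        if (!finished) throw std::runtime_error("destroyed unfinished");
      }
    };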
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 156ca2b..e3af2d2 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -149,7 +149,7 @@
#endif
eigen_internal_assert(outerStride == OuterStride);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return OuterStride; }
const Scalar* data;
};
@@ -198,19 +198,13 @@
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
- if (IsRowMajor)
- return m_d.data[row * m_d.outerStride() + col];
- else
- return m_d.data[row + col * m_d.outerStride()];
+ return coeff(getIndex(row, col));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
- if (IsRowMajor)
- return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
- else
- return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
+ return coeffRef(getIndex(row, col));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
@@ -219,10 +213,7 @@
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
- if (IsRowMajor)
- return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
- else
- return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
+ return packet<LoadMode, PacketType>(getIndex(row, col));
}
template <int LoadMode, typename PacketType>
@@ -232,19 +223,43 @@
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
- if (IsRowMajor)
- return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
- else
- return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
+ writePacket<StoreMode, PacketType>(getIndex(row, col), x);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
- return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+ pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+ Index count) {
+ writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+ Index count) {
+ pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count);
}
protected:
plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index constexpr getIndex(Index row, Index col) const {
+ return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
+ }
};
template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
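
The refactor above funnels all (row, col) addressing through the single getIndex helper, so the scalar, packet, and new packet-segment accessors share one flattening rule. The rule with a worked check (illustrative, outside the library):

    // Storage-order flattening used by getIndex: with outer stride s,
    // column-major (r, c) -> r + c*s, row-major (r, c) -> r*s + c.
    constexpr long flatten(bool isRowMajor, long r, long c, long s) {
      return isRowMajor ? r * s + c : r + c * s;
    }
    // 3x4 column-major, s = 3: element (2, 1) sits at linear index 5.
    static_assert(flatten(false, 2, 1, 3) == 5, "column-major check");
    static_assert(flatten(true, 2, 1, 4) == 9, "row-major, s = 4");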
@@ -318,6 +333,28 @@
m_argImpl.template writePacket<StoreMode, PacketType>(index, x);
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+ Index count) {
+ m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+ Index count) {
+ m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count);
+ }
+
protected:
evaluator<ArgType> m_argImpl;
};
@@ -464,10 +501,10 @@
struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
: evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
- typedef internal::remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
+ typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
enum {
- CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+ CoeffReadCost = functor_traits<NullaryOp>::Cost,
Flags = (evaluator<PlainObjectTypeCleaned>::Flags &
(HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0) |
@@ -502,9 +539,21 @@
return m_wrapper.template packetOp<PacketType>(m_functor, index);
}
+ template <int LoadMode, typename PacketType, typename IndexType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/,
+ Index /*count*/) const {
+ return packet<LoadMode, PacketType, IndexType>(row, col);
+ }
+
+ template <int LoadMode, typename PacketType, typename IndexType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/,
+ Index /*count*/) const {
+ return packet<LoadMode, PacketType, IndexType>(index);
+ }
+
protected:
const NullaryOp m_functor;
- const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
+ const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
};
// -------------------- CwiseUnaryOp --------------------
@@ -546,6 +595,16 @@
return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+ }
+
protected:
// this helper permits to completely eliminate the functor if it is empty
struct Data {
@@ -600,16 +659,11 @@
template <typename DstPacketType>
using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>;
- template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) const {
- return col + packetSize <= cols();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const {
+ return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows());
}
- template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const {
- return row + packetSize <= rows();
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const {
- return index + packetSize <= size();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const {
+ return index + count + begin <= size();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const {
@@ -632,43 +686,86 @@
template <int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
constexpr int PacketSize = unpacket_traits<PacketType>::size;
- Index actualRow = IsRowMajor ? row : row + (offset * PacketSize);
- Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col;
- eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds");
+ Index packetOffset = offset * PacketSize;
+ Index actualRow = IsRowMajor ? row : row + packetOffset;
+ Index actualCol = IsRowMajor ? col + packetOffset : col;
+ eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds");
return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol);
}
template <int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
constexpr int PacketSize = unpacket_traits<PacketType>::size;
- Index actualIndex = index + (offset * PacketSize);
- eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds");
+ Index packetOffset = offset * PacketSize;
+ Index actualIndex = index + packetOffset;
+ eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds");
return m_argImpl.template packet<LoadMode, PacketType>(actualIndex);
}
+ template <int LoadMode, typename PacketType = SrcPacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count,
+ Index offset) const {
+ constexpr int PacketSize = unpacket_traits<PacketType>::size;
+ Index packetOffset = offset * PacketSize;
+ Index actualRow = IsRowMajor ? row : row + packetOffset;
+ Index actualCol = IsRowMajor ? col + packetOffset : col;
+ eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds");
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
+ }
+ template <int LoadMode, typename PacketType = SrcPacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count,
+ Index offset) const {
+ constexpr int PacketSize = unpacket_traits<PacketType>::size;
+ Index packetOffset = offset * PacketSize;
+ Index actualIndex = index + packetOffset + begin;
+ eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds");
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
+ }
+
+ template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col,
+ Index begin,
+ Index count) const {
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<PacketType, NumPackets> packets;
+ Index offset = begin / SrcPacketSize;
+ Index actualBegin = begin % SrcPacketSize;
+ for (; offset < NumPackets; offset++) {
+ Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+ packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset);
+ if (count == actualCount) break;
+ actualBegin = 0;
+ count -= actualCount;
+ }
+ return packets;
+ }
+ template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index,
+ Index begin,
+ Index count) const {
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<PacketType, NumPackets> packets;
+ Index offset = begin / SrcPacketSize;
+ Index actualBegin = begin % SrcPacketSize;
+ for (; offset < NumPackets; offset++) {
+ Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+ packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset);
+ if (count == actualCount) break;
+ actualBegin = 0;
+ count -= actualCount;
+ }
+ return packets;
+ }
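+  // Worked walk-through of the split above (illustrative): with
+  // SrcPacketSize = 4, NumPackets = 2, begin = 3, count = 4, the request
+  // covers elements 3..6 of the destination packet.
+  //   offset = 3 / 4 = 0, actualBegin = 3 % 4 = 3
+  //   offset 0: actualCount = min(4 - 3, 4) = 1 -> element 3 of packet 0
+  //   offset 1: actualBegin = 0, count = 3, actualCount = 3 -> elements 0..2 of packet 1
+  // Lanes outside the request are converted by the subsequent pcast but are
+  // never stored, because the destination write is also a segment.
+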
// There is no source packet type with equal or fewer elements than DstPacketType.
// This is problematic as the evaluation loop may attempt to access data outside the bounds of the array.
// For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}.
// The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
// is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array.
-
- // Instead, perform runtime check to determine if the load would access data outside the bounds of the array.
- // If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load.
- // In either case, perform a vectorized cast of the source packet.
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
- SrcPacketType src;
- if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) {
- src = srcPacket<SrcLoadMode>(row, col, 0);
- } else {
- Array<SrcType, SrcPacketSize, 1> srcArray;
- for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k);
- for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
- src = pload<SrcPacketType>(srcArray.data());
- }
- return pcast<SrcPacketType, DstPacketType>(src);
+ return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0));
}
// Use the source packet type with the same size as DstPacketType, if it exists
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
@@ -704,22 +801,67 @@
srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7));
}
+ // packetSegment variants
+ template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+ Index count) const {
+ constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+ constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+ constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+ return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0));
+ }
+ // Use the source packet type with the same size as DstPacketType, if it exists
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+ Index count) const {
+ constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+ using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+ constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+ constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+ return pcast<SizedSrcPacketType, DstPacketType>(
+ srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0));
+ }
+ // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+ Index count) const {
+ constexpr int NumPackets = 2;
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<SrcPacketType, NumPackets> packets =
+ srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+ return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+ }
+ // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+ Index count) const {
+ constexpr int NumPackets = 4;
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<SrcPacketType, NumPackets> packets =
+ srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+ return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+ packets.packet[3]);
+ }
+ // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+ Index count) const {
+ constexpr int NumPackets = 8;
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<SrcPacketType, NumPackets> packets =
+ srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+ return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+ packets.packet[3], packets.packet[4], packets.packet[5],
+ packets.packet[6], packets.packet[7]);
+ }
+
// Analogous routines for linear access.
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
- SrcPacketType src;
- if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) {
- src = srcPacket<SrcLoadMode>(index, 0);
- } else {
- Array<SrcType, SrcPacketSize, 1> srcArray;
- for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k);
- for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
- src = pload<SrcPacketType>(srcArray.data());
- }
- return pcast<SrcPacketType, DstPacketType>(src);
+ return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0));
}
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
@@ -749,6 +891,55 @@
srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7));
}
+ // packetSegment variants
+ template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+ constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+ constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+ constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+ return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0));
+ }
+ // Use the source packet type with the same size as DstPacketType, if it exists
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+ constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+ using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+ constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+ constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+ return pcast<SizedSrcPacketType, DstPacketType>(
+ srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0));
+ }
+ // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+ constexpr int NumPackets = 2;
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<SrcPacketType, NumPackets> packets =
+ srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+ return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+ }
+ // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+ constexpr int NumPackets = 4;
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<SrcPacketType, NumPackets> packets =
+ srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+ return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+ packets.packet[3]);
+ }
+ // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+ template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+ constexpr int NumPackets = 8;
+ constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+ PacketBlock<SrcPacketType, NumPackets> packets =
+ srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+ return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+ packets.packet[3], packets.packet[4], packets.packet[5],
+ packets.packet[6], packets.packet[7]);
+ }
+
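
Both overload families above (the (row, col) forms and the linear forms) dispatch on the ratio between destination and source packet sizes: a destination packet with N times as many lanes as one source packet is assembled from a PacketBlock of N source segment loads, then converted in a single pcast. A minimal behavioral model of the net lane-wise conversion, using illustrative names rather than Eigen internals:

#include <array>
#include <cstddef>

// Model: each source packet has S lanes, the destination has R * S lanes.
// Only lanes in [begin, begin + count) are meaningful; this model zeroes
// the rest, while the real implementation leaves them unspecified.
template <typename DstScalar, typename SrcScalar, std::size_t S, std::size_t R>
std::array<DstScalar, R * S> cast_segment(const SrcScalar* src, std::size_t begin, std::size_t count) {
  std::array<DstScalar, R * S> dst{};
  for (std::size_t i = begin; i < begin + count; ++i) dst[i] = static_cast<DstScalar>(src[i]);
  return dst;
}
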
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; }
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; }
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; }
@@ -826,6 +1017,20 @@
m_d.arg3Impl.template packet<LoadMode, PacketType>(index));
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+ m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+ m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+ m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+ m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+ }
+
protected:
// this helper permits to completely eliminate the functor if it is empty
struct Data {
@@ -922,6 +1127,18 @@
m_d.rhsImpl.template packet<LoadMode, PacketType>(index));
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+ m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+ m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+ }
+
protected:
// this helper permits to completely eliminate the functor if it is empty
struct Data {
@@ -1013,7 +1230,7 @@
m_innerStride(map.innerStride()),
m_outerStride(map.outerStride()) {
EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0,
- internal::inner_stride_at_compile_time<Derived>::ret == 1),
+ inner_stride_at_compile_time<Derived>::ret == 1),
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
@@ -1035,36 +1252,60 @@
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
PointerType ptr = m_data + row * rowStride() + col * colStride();
- return internal::ploadt<PacketType, LoadMode>(ptr);
+ return ploadt<PacketType, LoadMode>(ptr);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
- return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+ return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
PointerType ptr = m_data + row * rowStride() + col * colStride();
- return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+ pstoret<Scalar, PacketType, StoreMode>(ptr, x);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
- internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+ pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ PointerType ptr = m_data + row * rowStride() + col * colStride();
+ return ploadtSegment<PacketType, LoadMode>(ptr, begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+ Index count) {
+ PointerType ptr = m_data + row * rowStride() + col * colStride();
+ pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+ Index count) {
+ pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x, begin, count);
}
protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowStride() const noexcept {
return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colStride() const noexcept {
return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
}
PointerType m_data;
- const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
- const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+ const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+ const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
};
template <typename PlainObjectType, int MapOptions, typename StrideType>
@@ -1117,7 +1358,7 @@
// -------------------- Block --------------------
template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
- bool HasDirectAccess = internal::has_direct_access<ArgType>::ret>
+ bool HasDirectAccess = has_direct_access<ArgType>::ret>
struct block_evaluator;
template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -1246,6 +1487,39 @@
x);
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col,
+ begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ if (ForwardLinearAccess)
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
+ else
+ return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
+ begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+ Index count) {
+ return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row,
+ m_startCol.value() + col, x, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+ Index count) {
+ if (ForwardLinearAccess)
+ return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
+ count);
+ else
+ return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+ RowsAtCompileTime == 1 ? index : 0, x, begin, count);
+ }
+
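
When ForwardLinearAccess does not hold, the linear overloads above fall back to coordinate access, relying on the fact that linear packet access through a block is only offered for compile-time vectors. A sketch of that index mapping (illustrative, not Eigen code):

#include <cassert>
#include <utility>

// A block that is a single row at compile time walks along columns;
// a single-column block walks along rows.
std::pair<int, int> linear_to_coords(bool single_row, int index) {
  return single_row ? std::make_pair(0, index) : std::make_pair(index, 0);
}

int main() {
  assert(linear_to_coords(true, 5) == std::make_pair(0, 5));   // row vector
  assert(linear_to_coords(false, 5) == std::make_pair(5, 0));  // column vector
}
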
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
@@ -1341,8 +1615,8 @@
typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
- typedef typename internal::nested_eval<ArgType, Factor>::type ArgTypeNested;
- typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+ typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
+ typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
enum {
CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
@@ -1361,19 +1635,15 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
// try to avoid using modulo; this is a pure optimization strategy
- const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
- : RowFactor == 1 ? row
- : row % m_rows.value();
- const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
- : ColFactor == 1 ? col
- : col % m_cols.value();
+ const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+ const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
return m_argImpl.coeff(actual_row, actual_col);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
// try to avoid using modulo; this is a pure optimization strategy
- const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
+ const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
? (ColFactor == 1 ? index : index % m_cols.value())
: (RowFactor == 1 ? index : index % m_rows.value());
@@ -1382,25 +1652,38 @@
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
- const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
- : RowFactor == 1 ? row
- : row % m_rows.value();
- const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
- : ColFactor == 1 ? col
- : col % m_cols.value();
+ const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+ const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
- const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
+ const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
? (ColFactor == 1 ? index : index % m_cols.value())
: (RowFactor == 1 ? index : index % m_rows.value());
return m_argImpl.template packet<LoadMode, PacketType>(actual_index);
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+ const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
+
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
+ ? (ColFactor == 1 ? index : index % m_cols.value())
+ : (RowFactor == 1 ? index : index % m_rows.value());
+
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count);
+ }
+
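
As in the packet() overloads, the segment variants avoid % whenever a compile-time branch settles the answer; the modulo only survives when both the replication factor and the argument extent along that dimension are non-trivial. A worked instance of the wrap-around arithmetic:

#include <cassert>

int main() {
  const int arg_rows = 3, arg_cols = 2;  // size of the replicated argument
  const int row = 4, col = 5;            // coordinate in the Replicate expression
  // coefficient (4, 5) of the replicate reads argument coefficient (1, 1)
  assert(row % arg_rows == 1 && col % arg_cols == 1);
}
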
protected:
const ArgTypeNested m_arg;
evaluator<ArgTypeNestedCleaned> m_argImpl;
@@ -1457,6 +1740,28 @@
m_argImpl.template writePacket<StoreMode>(index, x);
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+ Index count) {
+ m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count);
+ }
+
+ template <int StoreMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+ Index count) {
+ m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count);
+ }
+
protected:
evaluator<ArgType> m_argImpl;
};
@@ -1536,41 +1841,97 @@
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
- enum {
- PacketSize = unpacket_traits<PacketType>::size,
- OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
- OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
- };
- typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
- return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(
- ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? m_cols.value() - col - OffsetCol : col));
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+ static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+ static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+ using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+ Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+ Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+ return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
- enum { PacketSize = unpacket_traits<PacketType>::size };
- return preverse(
- m_argImpl.template packet<LoadMode, PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+ Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+ return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
- // FIXME we could factorize some code with packet(i,j)
- enum {
- PacketSize = unpacket_traits<PacketType>::size,
- OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
- OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
- };
- typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
- m_argImpl.template writePacket<LoadMode>(ReverseRow ? m_rows.value() - row - OffsetRow : row,
- ReverseCol ? m_cols.value() - col - OffsetCol : col,
- reverse_packet::run(x));
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+ static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+ static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+ using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+ Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+ Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+ m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
- enum { PacketSize = unpacket_traits<PacketType>::size };
- m_argImpl.template writePacket<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+ Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+ m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+ static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+ static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+ using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+ Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+ Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+ Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+ return reverse_packet::run(
+ m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+ Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+ Index actualBegin = PacketSize - count - begin;
+
+ return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+ Index count) {
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+ static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+ static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+ using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+ Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+ Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+ Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+ m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+ Index count) {
+ static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+ Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+ Index actualBegin = PacketSize - count - begin;
+
+ m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count);
}
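
The begin remapping is the subtle part of these reverse overloads: reversal moves the valid segment to the opposite end of the packet, so destination lanes [begin, begin + count) are sourced from lanes [PacketSize - count - begin, PacketSize - begin) of the underlying packet, which preverse then puts back in order. A small check of that arithmetic (a sketch, not Eigen code):

#include <cassert>

int main() {
  const int PacketSize = 8, begin = 1, count = 3;
  const int actualBegin = PacketSize - count - begin;  // = 4
  // preverse maps destination lane d to source lane PacketSize - 1 - d,
  // so destination lanes [1, 4) must come from source lanes [4, 7).
  for (int d = begin; d < begin + count; ++d) {
    const int s = PacketSize - 1 - d;
    assert(s >= actualBegin && s < actualBegin + count);
  }
}
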
protected:
@@ -1621,13 +1982,13 @@
protected:
evaluator<ArgType> m_argImpl;
- const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+ const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
private:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const {
return m_index.value() > 0 ? 0 : -m_index.value();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const {
return m_index.value() > 0 ? m_index.value() : 0;
}
};
@@ -1656,9 +2017,9 @@
const ArgType& arg() const { return m_arg; }
- EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); }
+ constexpr Index rows() const noexcept { return m_arg.rows(); }
- EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); }
+ constexpr Index cols() const noexcept { return m_arg.cols(); }
private:
const ArgType& m_arg;
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index aa79b60..e2b2da5 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -108,12 +108,12 @@
eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept {
// return the fixed size type if available to enable compile time optimizations
return internal::traits<internal::remove_all_t<LhsNested>>::RowsAtCompileTime == Dynamic ? m_rhs.rows()
: m_lhs.rows();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept {
// return the fixed size type if available to enable compile time optimizations
return internal::traits<internal::remove_all_t<LhsNested>>::ColsAtCompileTime == Dynamic ? m_rhs.cols()
: m_lhs.cols();
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 9c305c6..13a542a 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -76,8 +76,8 @@
EIGEN_STATIC_ASSERT(CwiseNullaryOp::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const { return m_cols.value(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows.value(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols.value(); }
/** \returns the functor representing the nullary operation */
EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 42ed459..94ec1a0 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -60,8 +60,8 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
: m_xpr(xpr), m_functor(func) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_xpr.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_xpr.cols(); }
/** \returns the functor representing the unary operation */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 49b1410..7dd7623 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -66,13 +66,13 @@
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeffRef(0)); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
return StrideType::InnerStrideAtCompileTime != 0 ? int(StrideType::InnerStrideAtCompileTime)
: derived().nestedExpression().innerStride() *
sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
return StrideType::OuterStrideAtCompileTime != 0 ? int(StrideType::OuterStrideAtCompileTime)
: derived().nestedExpression().outerStride() *
sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
@@ -145,8 +145,8 @@
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.cols(); }
/** \returns the functor representing unary operation */
EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index d5906bd..4f68942 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -208,7 +208,7 @@
* \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
* with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
* column-major matrix, and the number of rows for a row-major matrix. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const {
+ EIGEN_DEVICE_FUNC constexpr Index outerSize() const {
return IsVectorAtCompileTime ? 1 : int(IsRowMajor) ? this->rows() : this->cols();
}
@@ -217,7 +217,7 @@
* \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
* with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
* column-major matrix, and the number of columns for a row-major matrix. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const {
+ EIGEN_DEVICE_FUNC constexpr Index innerSize() const {
return IsVectorAtCompileTime ? this->size() : int(IsRowMajor) ? this->cols() : this->rows();
}
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 97f9b50..cff104c 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -89,13 +89,12 @@
*
* \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeff(Index row, Index col) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
return internal::evaluator<Derived>(derived()).coeff(row, col);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeffByOuterInner(Index outer,
- Index inner) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
return coeff(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
}
@@ -103,7 +102,7 @@
*
* \sa operator()(Index,Index), operator[](Index)
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator()(Index row, Index col) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index row, Index col) const {
eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
return coeff(row, col);
}
@@ -123,7 +122,7 @@
* \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeff(Index index) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const {
EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
eigen_internal_assert(index >= 0 && index < size());
@@ -138,7 +137,7 @@
* z() const, w() const
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator[](Index index) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator[](Index index) const {
EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
eigen_assert(index >= 0 && index < size());
@@ -155,32 +154,32 @@
* z() const, w() const
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator()(Index index) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index index) const {
eigen_assert(index >= 0 && index < size());
return coeff(index);
}
/** equivalent to operator[](0). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType x() const { return (*this)[0]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType x() const { return (*this)[0]; }
/** equivalent to operator[](1). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType y() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType y() const {
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
return (*this)[1];
}
/** equivalent to operator[](2). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType z() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType z() const {
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
return (*this)[2];
}
/** equivalent to operator[](3). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType w() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType w() const {
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
return (*this)[3];
}
@@ -362,32 +361,32 @@
* \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& operator()(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index index) {
eigen_assert(index >= 0 && index < size());
return coeffRef(index);
}
/** equivalent to operator[](0). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& x() { return (*this)[0]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& x() { return (*this)[0]; }
/** equivalent to operator[](1). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& y() {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& y() {
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
return (*this)[1];
}
/** equivalent to operator[](2). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& z() {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& z() {
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
return (*this)[2];
}
/** equivalent to operator[](3). */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& w() {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& w() {
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
return (*this)[3];
}
@@ -421,33 +420,29 @@
*
* \sa outerStride(), rowStride(), colStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().innerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return derived().innerStride(); }
/** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
* in a column-major matrix).
*
* \sa innerStride(), rowStride(), colStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return derived().outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return derived().outerStride(); }
  // FIXME shall we remove it?
- EIGEN_CONSTEXPR inline Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
+ constexpr Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
/** \returns the pointer increment between two consecutive rows.
*
* \sa innerStride(), outerStride(), colStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const {
- return Derived::IsRowMajor ? outerStride() : innerStride();
- }
+ EIGEN_DEVICE_FUNC constexpr Index rowStride() const { return Derived::IsRowMajor ? outerStride() : innerStride(); }
/** \returns the pointer increment between two consecutive columns.
*
* \sa innerStride(), outerStride(), rowStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const {
- return Derived::IsRowMajor ? innerStride() : outerStride();
- }
+ EIGEN_DEVICE_FUNC constexpr Index colStride() const { return Derived::IsRowMajor ? innerStride() : outerStride(); }
};
/** \brief Base class providing direct read/write coefficient access to matrices and arrays.
@@ -478,25 +473,23 @@
*
* \sa outerStride(), rowStride(), colStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
/** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
* in a column-major matrix).
*
* \sa innerStride(), rowStride(), colStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
  // FIXME shall we remove it?
- EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT {
- return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
- }
+ constexpr Index stride() const noexcept { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
/** \returns the pointer increment between two consecutive rows.
*
* \sa innerStride(), outerStride(), colStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index rowStride() const noexcept {
return Derived::IsRowMajor ? outerStride() : innerStride();
}
@@ -504,7 +497,7 @@
*
* \sa innerStride(), outerStride(), rowStride()
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index colStride() const noexcept {
return Derived::IsRowMajor ? innerStride() : outerStride();
}
};
@@ -513,7 +506,7 @@
template <int Alignment, typename Derived, bool JustReturnZero>
struct first_aligned_impl {
- static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; }
+ static constexpr Index run(const Derived&) noexcept { return 0; }
};
template <int Alignment, typename Derived>
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
index 75fc8e7..012dce1 100644
--- a/Eigen/src/Core/DeviceWrapper.h
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -87,13 +87,13 @@
int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop_with_device {
using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Device&) { Base::run(kernel); }
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); }
};
// entry point for a generic expression with device
template <typename Dst, typename Src, typename Func, typename Device>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
- const Src& src, const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
+ const Src& src, const Func& func) {
enum {
NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
(int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
@@ -115,10 +115,8 @@
// copy and pasted from AssignEvaluator except forward device to kernel
template <typename DstXprType, typename SrcXprType, typename Functor, typename Device>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
- const SrcXprType& src,
- const Functor& func,
- Device& device) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+ const Functor& func, Device& device) {
using DstEvaluatorType = evaluator<DstXprType>;
using SrcEvaluatorType = evaluator<SrcXprType>;
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 8d27857..ff8611c 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -83,13 +83,11 @@
: numext::mini<Index>(m_matrix.rows(), m_matrix.cols() - m_index.value());
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return 1; }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return 1; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
- return m_matrix.outerStride() + 1;
- }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.outerStride() + 1; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 0; }
typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
@@ -134,13 +132,13 @@
private:
// some compilers may fail to optimize std::max etc in case of compile-time constants...
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index absDiagIndex() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index absDiagIndex() const noexcept {
return m_index.value() > 0 ? m_index.value() : -m_index.value();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const noexcept {
return m_index.value() > 0 ? 0 : -m_index.value();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const noexcept {
return m_index.value() > 0 ? m_index.value() : 0;
}
// trigger a compile-time error if someone try to call packet
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 4115b64..52630d9 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -76,9 +76,9 @@
}
/** \returns the number of rows. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return diagonal().size(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const { return diagonal().size(); }
/** \returns the number of columns. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return diagonal().size(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const { return diagonal().size(); }
/** \returns the diagonal matrix product of \c *this by the dense matrix, \a matrix */
template <typename MatrixDerived>
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index 894bfc1..c9a6e88 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -56,12 +56,12 @@
EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
/** \returns the number of rows. \sa cols(), RowsAtCompileTime */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
/** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
/** \returns the number of coefficients, which is rows()*cols().
* \sa rows(), cols(), SizeAtCompileTime. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return rows() * cols(); }
/** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
template <typename Dest>
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h
index 3b0af91..9d4ecd4 100644
--- a/Eigen/src/Core/Fill.h
+++ b/Eigen/src/Core/Fill.h
@@ -92,7 +92,8 @@
template <typename Xpr>
struct eigen_memset_helper {
- static constexpr bool value = std::is_trivial<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
+ static constexpr bool value =
+ std::is_trivially_copyable<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
};
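
Relaxing std::is_trivial to std::is_trivially_copyable extends the memset fill path to types like std::complex: user-provided constructors make std::complex non-trivial, yet it remains trivially copyable, which is the property a byte-wise fill actually needs. A compile-time check of the distinction, assuming a mainstream standard library:

#include <complex>
#include <type_traits>

static_assert(!std::is_trivial<std::complex<float>>::value,
              "complex has user-provided constructors");
static_assert(std::is_trivially_copyable<std::complex<float>>::value,
              "but its object representation can still be copied/filled byte-wise");
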
template <typename Xpr>
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index a91b0da..55beab3 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,14 +41,10 @@
EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
- return m_expression.outerStride();
- }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
- return m_expression.innerStride();
- }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const {
return m_expression.coeff(row, col);
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 26a4634..d45cb4b 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -72,6 +72,7 @@
HasReciprocal = 0,
HasSqrt = 0,
HasRsqrt = 0,
+ HasCbrt = 0,
HasExp = 0,
HasExpm1 = 0,
HasLog = 0,
@@ -368,6 +369,11 @@
EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) {
return a / b;
}
+// Avoid compiler warning for boolean algebra.
+template <>
+EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) {
+ return a && b;
+}
// In the generic case, memset to all one bits.
template <typename Packet, typename EnableIf = void>
@@ -449,48 +455,42 @@
template <typename T>
struct bit_and {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
};
template <typename T>
struct bit_or {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
};
template <typename T>
struct bit_xor {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
};
template <typename T>
struct bit_not {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
};
template <>
struct bit_and<bool> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
- return a && b;
- }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a && b; }
};
template <>
struct bit_or<bool> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
- return a || b;
- }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a || b; }
};
template <>
struct bit_xor<bool> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
- return a != b;
- }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a != b; }
};
template <>
struct bit_not<bool> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
+ EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
};
// Use operators &, |, ^, ~.
@@ -580,7 +580,7 @@
}
// In the general case, use bitwise select.
-template <typename Packet, typename EnableIf = void>
+template <typename Packet, bool is_scalar = is_scalar<Packet>::value>
struct pselect_impl {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
return por(pand(a, mask), pandnot(b, mask));
@@ -589,9 +589,9 @@
// For scalars, use ternary select.
template <typename Packet>
-struct pselect_impl<Packet, std::enable_if_t<is_scalar<Packet>::value>> {
+struct pselect_impl<Packet, true> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
- return numext::equal_strict(mask, Packet(0)) ? b : a;
+ return numext::select(mask, a, b);
}
};
@@ -1294,29 +1294,61 @@
 * The following functions might not need to be overloaded for vectorized types
***************************************************************************/
+template <typename Packet, typename EnableIf = void>
+struct pmadd_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
+ return padd(pmul(a, b), c);
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
+ return psub(pmul(a, b), c);
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
+ return psub(c, pmul(a, b));
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
+ return pnegate(pmadd(a, b, c));
+ }
+};
+
+template <typename Scalar>
+struct pmadd_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value && NumTraits<Scalar>::IsSigned>> {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return numext::fma(a, b, c);
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return numext::fma(a, b, Scalar(-c));
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return numext::fma(Scalar(-a), b, c);
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return -Scalar(numext::fma(a, b, c));
+ }
+};
+
// FMA instructions.
/** \internal \returns a * b + c (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
- return padd(pmul(a, b), c);
+ return pmadd_impl<Packet>::pmadd(a, b, c);
}
/** \internal \returns a * b - c (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
- return psub(pmul(a, b), c);
+ return pmadd_impl<Packet>::pmsub(a, b, c);
}
/** \internal \returns -(a * b) + c (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
- return psub(c, pmul(a, b));
+ return pmadd_impl<Packet>::pnmadd(a, b, c);
}
 /** \internal \returns -(a * b + c) (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
- return pnegate(pmadd(a, b, c));
+ return pmadd_impl<Packet>::pnmsub(a, b, c);
}
/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned
@@ -1525,6 +1557,104 @@
return (Packet)pand(result, peven_mask(result)); // atan2 0 atan2 0 ...
}
+/** \internal \returns a packet whose lanes in the range [begin, begin + count) are loaded from \a *from. Lanes
+ * outside this range are undefined. \a *from does not need to be aligned, and can be null if \a count is zero.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+ Index count) {
+ using Scalar = typename unpacket_traits<Packet>::type;
+ constexpr Index PacketSize = unpacket_traits<Packet>::size;
+ eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+ Scalar aux[PacketSize];
+ memset(static_cast<void*>(aux), 0x00, sizeof(Scalar) * PacketSize);
+ smart_copy(from + begin, from + begin + count, aux + begin);
+ return ploadu<Packet>(aux);
+}
+
+/** \internal \returns a packet whose lanes in the range [begin, begin + count) are loaded from \a *from. Lanes
+ * outside this range are undefined. \a *from must be aligned, and cannot be null.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+ Index count) {
+ return ploaduSegment<Packet>(from, begin, count);
+}
+
+/** \internal copy the elements of the packet \a from in the range [begin, begin + count) to \a *to. Lanes of \a from
+outside this range may hold undefined values; elements of \a *to outside this range are left untouched. \a *to does
+not need to be aligned, and can be null if \a count is zero.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+ constexpr Index PacketSize = unpacket_traits<Packet>::size;
+ eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+ Scalar aux[PacketSize];
+ pstoreu<Scalar, Packet>(aux, from);
+ smart_copy(aux + begin, aux + begin + count, to + begin);
+}
+
+/** \internal copy the elements of the packet \a from in the range [begin, begin + count) to \a *to. Lanes of \a from
+outside this range may hold undefined values; elements of \a *to outside this range are left untouched. \a *to must
+be aligned, and cannot be null.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+ return pstoreuSegment(to, from, begin, count);
+}
+
+/** \internal \returns a packet whose lanes in the range [begin, begin + count) are loaded from \a *from, using an
+ * aligned load if \a Alignment permits it. Lanes outside this range are undefined.*/
+template <typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+ Index count) {
+ constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+ if (Alignment >= RequiredAlignment) {
+ return ploadSegment<Packet>(from, begin, count);
+ } else {
+ return ploaduSegment<Packet>(from, begin, count);
+ }
+}
+
+/** \internal copy the elements of the packet \a from in the range [begin, begin + count) to \a *to, using an aligned
+store if \a Alignment permits it. Elements of \a *to outside this range are left untouched.*/
+template <typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+ constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+ if (Alignment >= RequiredAlignment) {
+ pstoreSegment<Scalar, Packet>(to, from, begin, count);
+ } else {
+ pstoreuSegment<Scalar, Packet>(to, from, begin, count);
+ }
+}
+
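
Together these primitives let kernels touch only part of a packet, replacing scalar epilogues for remainders. A usage sketch with the unaligned variants defined above (assuming this patch is applied; Packet is whatever packet_traits selects for float on the target):

#include <Eigen/Core>

// Scale n floats by k: full packets first, then one segment load/store
// for the remainder instead of a per-element loop.
void scale(float* data, Eigen::Index n, float k) {
  using namespace Eigen::internal;
  using Packet = packet_traits<float>::type;
  constexpr Eigen::Index PS = unpacket_traits<Packet>::size;
  const Packet pk = pset1<Packet>(k);
  Eigen::Index i = 0;
  for (; i + PS <= n; i += PS) pstoreu(data + i, pmul(ploadu<Packet>(data + i), pk));
  if (i < n) {
    const Packet p = ploaduSegment<Packet>(data + i, 0, n - i);
    pstoreuSegment(data + i, pmul(p, pk), 0, n - i);
  }
}
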
+#ifndef EIGEN_NO_IO
+
+template <typename Packet>
+class StreamablePacket {
+ public:
+ using Scalar = typename unpacket_traits<Packet>::type;
+ StreamablePacket(const Packet& packet) { pstoreu(v_, packet); }
+
+ friend std::ostream& operator<<(std::ostream& os, const StreamablePacket& packet) {
+ os << "{" << packet.v_[0];
+ for (int i = 1; i < unpacket_traits<Packet>::size; ++i) {
+ os << "," << packet.v_[i];
+ }
+ os << "}";
+ return os;
+ }
+
+ private:
+ Scalar v_[unpacket_traits<Packet>::size];
+};
+
+/**
+ * \internal \returns a streamable wrapper that prints a packet's elements to a std::ostream, e.g. for debugging.
+ */
+template <typename Packet>
+StreamablePacket<Packet> postream(const Packet& packet) {
+ return StreamablePacket<Packet>(packet);
+}
+
+#endif // EIGEN_NO_IO
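
SIMD packet types are opaque (a bare __m128 has no operator<<), so postream spills the packet to a scalar buffer and streams that instead. A usage sketch, assuming this patch is applied:

#include <iostream>
#include <Eigen/Core>

int main() {
  using namespace Eigen::internal;
  using Packet = packet_traits<float>::type;
  const Packet p = pset1<Packet>(1.5f);
  std::cout << postream(p) << std::endl;  // e.g. {1.5,1.5,1.5,1.5} with SSE
}
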
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index 454e560..358239c 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -225,14 +225,14 @@
return this->nestedExpression().data() + row_offset + col_offset;
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept {
if (traits<Derived>::InnerStrideAtCompileTime != Dynamic) {
return traits<Derived>::InnerStrideAtCompileTime;
}
return innerIncrement() * this->nestedExpression().innerStride();
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept {
if (traits<Derived>::OuterStrideAtCompileTime != Dynamic) {
return traits<Derived>::OuterStrideAtCompileTime;
}
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index 013ad0a..79fc3ab 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -51,8 +51,8 @@
explicit EIGEN_DEVICE_FUNC Inverse(const XprType& xpr) : m_xpr(xpr) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index df7b7ca..c740da7 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -102,11 +102,11 @@
typedef PointerType PointerArgType;
EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic
? Index(internal::traits<Map>::OuterStrideAtCompileTime)
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 1e83fdf..5e3d746 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -84,9 +84,9 @@
typedef typename Base::CoeffReturnType CoeffReturnType;
/** \copydoc DenseBase::rows() */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_rows.value(); }
/** \copydoc DenseBase::cols() */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_cols.value(); }
/** Returns a pointer to the first coefficient of the matrix or vector.
*
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 528aed2..941961d 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -170,8 +170,8 @@
template <typename Scalar>
struct imag_ref_default_impl<Scalar, false> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); }
+ EIGEN_DEVICE_FUNC constexpr static Scalar run(Scalar&) { return Scalar(0); }
+ EIGEN_DEVICE_FUNC constexpr static const Scalar run(const Scalar&) { return Scalar(0); }
};
template <typename Scalar>
@@ -182,6 +182,10 @@
typedef typename NumTraits<Scalar>::Real& type;
};
+// implementation in MathFunctionsImpl.h
+template <typename Mask, bool is_built_in_float = std::is_floating_point<Mask>::value>
+struct scalar_select_mask;
+
} // namespace internal
namespace numext {
@@ -207,6 +211,11 @@
return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
}
+template <typename Scalar, typename Mask>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Mask& mask, const Scalar& a, const Scalar& b) {
+ return internal::scalar_select_mask<Mask>::run(mask) ? b : a;
+}
+
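The mask convention matches the packet-level pselect: a non-zero mask selects \a a, an exactly-zero mask selects \a b. For built-in floats the test is done on the bit pattern of the absolute value, so -0.0 also counts as zero. A small illustrative sketch:

// Illustrative only: scalar select with float masks.
float a = 1.f, b = 2.f;
float r0 = Eigen::numext::select(3.f, a, b);   // non-zero mask -> a == 1.f
float r1 = Eigen::numext::select(0.f, a, b);   // zero mask     -> b == 2.f
float r2 = Eigen::numext::select(-0.f, a, b);  // -0.0 is zero  -> b == 2.f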
} // namespace numext
namespace internal {
@@ -827,8 +836,8 @@
template <typename T>
EIGEN_DEVICE_FUNC
- std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
- isnan_impl(const T&) {
+std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
+isnan_impl(const T&) {
return false;
}
@@ -936,6 +945,38 @@
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) { return x; }
};
+// Default implementation.
+template <typename Scalar, typename Enable = void>
+struct fma_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return a * b + c;
+ }
+};
+
+// ADL version if it exists.
+template <typename T>
+struct fma_impl<
+ T,
+ std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>> {
+ static T run(const T& a, const T& b, const T& c) { return fma(a, b, c); }
+};
+
+#if defined(EIGEN_GPUCC)
+template <>
+struct fma_impl<float, void> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float run(const float& a, const float& b, const float& c) {
+ return ::fmaf(a, b, c);
+ }
+};
+
+template <>
+struct fma_impl<double, void> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double run(const double& a, const double& b, const double& c) {
+ return ::fma(a, b, c);
+ }
+};
+#endif
+
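The middle specialization is selected purely by ADL: if an unqualified fma(T, T, T) returning T is visible for T, the decltype in the enable_if is well-formed and fma_impl forwards to it. A hypothetical user type exercising that branch (the mylib names are invented for illustration):

// Hypothetical example: a user type whose namespace provides fma.
namespace mylib {
struct Fixed {
  int v;
};
inline Fixed fma(Fixed a, Fixed b, Fixed c) { return Fixed{a.v * b.v + c.v}; }
}  // namespace mylib
// decltype(fma(declval<Fixed>(), ...)) is Fixed, so the enable_if above is
// satisfied and Eigen::internal::fma_impl<mylib::Fixed> calls mylib::fma.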
} // end namespace internal
/****************************************************************************
@@ -1256,7 +1297,7 @@
// Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0
template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) {
using UnsignedT = typename internal::make_unsigned<T>::type;
EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
// Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
@@ -1269,7 +1310,7 @@
// Integer round down to nearest power of b
// T is assumed to be an integer type with a>=0, and b>0
template <typename T, typename U>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T round_down(T a, U b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) {
using UnsignedT = typename internal::make_unsigned<T>::type;
using UnsignedU = typename internal::make_unsigned<U>::type;
EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
@@ -1282,7 +1323,7 @@
/** Log base 2 for 32 bits positive integers.
* Conveniently returns 0 for x==0. */
-EIGEN_CONSTEXPR inline int log2(int x) {
+constexpr int log2(int x) {
unsigned int v(x);
constexpr int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
@@ -1320,11 +1361,17 @@
/** \returns the cube root of \a x. **/
template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cbrt(const T& x) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!NumTraits<T>::IsComplex, T> cbrt(const T& x) {
EIGEN_USING_STD(cbrt);
return static_cast<T>(cbrt(x));
}
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsComplex, T> cbrt(const T& x) {
+ EIGEN_USING_STD(pow);
+ return pow(x, typename NumTraits<T>::Real(1.0 / 3.0));
+}
+
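Note that the complex overload returns the principal cube root, which for a negative real input differs from the real-valued cbrt. A quick illustration under that assumption:

// Illustrative only: principal vs. real cube root.
std::complex<double> z(-8.0, 0.0);
std::complex<double> c = Eigen::numext::cbrt(z);  // ~(1.0, 1.7320508), i.e. 2*exp(i*pi/3)
double r = Eigen::numext::cbrt(-8.0);             // -2.0, the real branch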
/** \returns the reciprocal square root of \a x. **/
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T rsqrt(const T& x) {
@@ -1353,17 +1400,17 @@
#endif
template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
- abs(const T& x) {
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
+abs(const T& x) {
EIGEN_USING_STD(abs);
return abs(x);
}
template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
- abs(const T& x) {
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
+abs(const T& x) {
return x;
}
@@ -1843,6 +1890,15 @@
return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n);
}
+// Use std::fma if available.
+using std::fma;
+
+// Otherwise, rely on template implementation.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) {
+ return internal::fma_impl<Scalar>::run(x, y, z);
+}
+
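Resolution order for numext::fma is therefore: the std::fma overloads pulled in by the using-declaration for built-in floating-point arguments, then the template, which dispatches through internal::fma_impl (GPU intrinsics under EIGEN_GPUCC, an ADL-found fma if one exists, and the plain a * b + c fallback otherwise). A usage sketch:

// Illustrative only: the same call works for built-in and wrapped types.
float f = Eigen::numext::fma(2.f, 3.f, 1.f);  // 7.f, via std::fma
Eigen::half h = Eigen::numext::fma(Eigen::half(2.f), Eigen::half(3.f), Eigen::half(1.f));
// Without native half fma, this lands in a fallback (per this update, an
// fma<float>-based one where the platform lacks native fma).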
} // end namespace numext
namespace internal {
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 8e2705b..cbac1c2 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -76,7 +76,7 @@
static_assert(Steps > 0, "Steps must be at least 1.");
using Scalar = typename unpacket_traits<Packet>::type;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) {
- constexpr Scalar kMinusHalf = Scalar(-1) / Scalar(2);
+ const Scalar kMinusHalf = Scalar(-1) / Scalar(2);
const Packet cst_minus_half = pset1<Packet>(kMinusHalf);
const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
@@ -256,6 +256,48 @@
return ComplexT(numext::log(a), b);
}
+// For generic scalars, use ternary select.
+template <typename Mask>
+struct scalar_select_mask<Mask, /*is_built_in_float*/ false> {
+ static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) { return numext::is_exactly_zero(mask); }
+};
+
+// For built-in float mask, bitcast the mask to its integer counterpart and use ternary select.
+template <typename Mask>
+struct scalar_select_mask<Mask, /*is_built_in_float*/ true> {
+ using IntegerType = typename numext::get_integer_by_size<sizeof(Mask)>::unsigned_type;
+ static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) {
+ return numext::is_exactly_zero(numext::bit_cast<IntegerType>(std::abs(mask)));
+ }
+};
+
+template <int Size = sizeof(long double)>
+struct ldbl_select_mask {
+ static constexpr int MantissaDigits = std::numeric_limits<long double>::digits;
+ static constexpr int NumBytes = (MantissaDigits == 64 ? 80 : 128) / CHAR_BIT;
+ static EIGEN_DEVICE_FUNC inline bool run(const long double& mask) {
+ const uint8_t* mask_bytes = reinterpret_cast<const uint8_t*>(&mask);
+ for (Index i = 0; i < NumBytes; i++) {
+ if (mask_bytes[i] != 0) return false;
+ }
+ return true;
+ }
+};
+
+template <>
+struct ldbl_select_mask<sizeof(double)> : scalar_select_mask<double> {};
+
+template <>
+struct scalar_select_mask<long double, true> : ldbl_select_mask<> {};
+
+template <typename RealMask>
+struct scalar_select_mask<std::complex<RealMask>, false> {
+ using impl = scalar_select_mask<RealMask>;
+ static EIGEN_DEVICE_FUNC inline bool run(const std::complex<RealMask>& mask) {
+ return impl::run(numext::real(mask)) && impl::run(numext::imag(mask));
+ }
+};
+
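Two details worth noting: the long double path compares only the value-carrying bytes (80 bits for 64-bit-mantissa x87 extended precision, 128 bits otherwise), since sizeof(long double) may include padding bytes with indeterminate contents; and a complex mask counts as zero only when both parts are exactly zero. An illustrative sketch of the complex case:

// Illustrative only: complex masks select b only when fully zero.
std::complex<float> m0(0.f, 0.f), m1(0.f, 1.f);
float a = 1.f, b = 2.f;
float r0 = Eigen::numext::select(m0, a, b);  // both parts zero -> b
float r1 = Eigen::numext::select(m1, a, b);  // non-zero imag   -> a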
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 67590fb..a2c8eba 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -258,8 +258,8 @@
/** \brief Moves the matrix into the other one.
*
*/
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other)
- EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) noexcept(
+ std::is_nothrow_move_assignable<Scalar>::value) {
Base::operator=(std::move(other));
return *this;
}
@@ -393,8 +393,8 @@
template <typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other) : Base(other.derived()) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
/////////// Geometry module ///////////
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index ec360eb..2ce83a8 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -45,8 +45,8 @@
EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 1dc3448..5e4e5c2 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -22,13 +22,13 @@
template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits_impl {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits; }
+ EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits; }
};
template <typename T>
struct default_digits_impl<T, false, false> // Floating point
{
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+ EIGEN_DEVICE_FUNC constexpr static int run() {
using std::ceil;
using std::log2;
typedef typename NumTraits<T>::Real Real;
@@ -39,7 +39,7 @@
template <typename T>
struct default_digits_impl<T, false, true> // Integer
{
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+ EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
};
// default implementation of digits10(), based on numeric_limits if specialized,
@@ -47,13 +47,13 @@
template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits10_impl {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits10; }
+ EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits10; }
};
template <typename T>
struct default_digits10_impl<T, false, false> // Floating point
{
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+ EIGEN_DEVICE_FUNC constexpr static int run() {
using std::floor;
using std::log10;
typedef typename NumTraits<T>::Real Real;
@@ -64,7 +64,7 @@
template <typename T>
struct default_digits10_impl<T, false, true> // Integer
{
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+ EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
};
// default implementation of max_digits10(), based on numeric_limits if specialized,
@@ -72,13 +72,13 @@
template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
bool is_integer = NumTraits<T>::IsInteger>
struct default_max_digits10_impl {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::max_digits10; }
+ EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::max_digits10; }
};
template <typename T>
struct default_max_digits10_impl<T, false, false> // Floating point
{
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+ EIGEN_DEVICE_FUNC constexpr static int run() {
using std::ceil;
using std::log10;
typedef typename NumTraits<T>::Real Real;
@@ -89,7 +89,7 @@
template <typename T>
struct default_max_digits10_impl<T, false, true> // Integer
{
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+ EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
};
} // end namespace internal
@@ -188,32 +188,30 @@
typedef T Nested;
typedef T Literal;
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
+ EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl<T>::run(); }
+ EIGEN_DEVICE_FUNC constexpr static int digits10() { return internal::default_digits10_impl<T>::run(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() {
- return internal::default_max_digits10_impl<T>::run();
- }
+ EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return internal::default_max_digits10_impl<T>::run(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits() { return internal::default_digits_impl<T>::run(); }
+ EIGEN_DEVICE_FUNC constexpr static int digits() { return internal::default_digits_impl<T>::run(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
+ EIGEN_DEVICE_FUNC constexpr static int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
+ EIGEN_DEVICE_FUNC constexpr static int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() {
+ EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() {
// make sure to override this for floating-point types
return Real(0);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits<T>::max)(); }
+ EIGEN_DEVICE_FUNC constexpr static T highest() { return (numext::numeric_limits<T>::max)(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { return (numext::numeric_limits<T>::lowest)(); }
+ EIGEN_DEVICE_FUNC constexpr static T lowest() { return (numext::numeric_limits<T>::lowest)(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits<T>::infinity(); }
+ EIGEN_DEVICE_FUNC constexpr static T infinity() { return numext::numeric_limits<T>::infinity(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
+ EIGEN_DEVICE_FUNC constexpr static T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
};
template <typename T>
@@ -221,25 +219,23 @@
template <>
struct NumTraits<float> : GenericNumTraits<float> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; }
+ EIGEN_DEVICE_FUNC constexpr static float dummy_precision() { return 1e-5f; }
};
template <>
struct NumTraits<double> : GenericNumTraits<double> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; }
+ EIGEN_DEVICE_FUNC constexpr static double dummy_precision() { return 1e-12; }
};
// GPU devices treat `long double` as `double`.
#ifndef EIGEN_GPU_COMPILE_PHASE
template <>
struct NumTraits<long double> : GenericNumTraits<long double> {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double dummy_precision() {
- return static_cast<long double>(1e-15l);
- }
+ EIGEN_DEVICE_FUNC constexpr static long double dummy_precision() { return static_cast<long double>(1e-15l); }
#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
// PowerPC double double causes issues with some values
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double epsilon() {
+ EIGEN_DEVICE_FUNC constexpr static long double epsilon() {
// 2^(-(__LDBL_MANT_DIG__)+1)
return static_cast<long double>(2.4651903288156618919116517665087e-32l);
}
@@ -260,10 +256,10 @@
MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
};
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits<Real>::digits10(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() { return NumTraits<Real>::max_digits10(); }
+ EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return NumTraits<Real>::epsilon(); }
+ EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+ EIGEN_DEVICE_FUNC constexpr static int digits10() { return NumTraits<Real>::digits10(); }
+ EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return NumTraits<Real>::max_digits10(); }
};
template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -290,25 +286,19 @@
: ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
};
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar dummy_precision() {
- return NumTraits<RealScalar>::dummy_precision();
- }
+ EIGEN_DEVICE_FUNC constexpr static RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+ EIGEN_DEVICE_FUNC constexpr static RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
- EIGEN_CONSTEXPR
- static inline int digits10() { return NumTraits<Scalar>::digits10(); }
- EIGEN_CONSTEXPR
- static inline int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
+ constexpr static int digits10() { return NumTraits<Scalar>::digits10(); }
+ constexpr static int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
};
template <>
struct NumTraits<std::string> : GenericNumTraits<std::string> {
enum { RequireInitialization = 1, ReadCost = HugeCost, AddCost = HugeCost, MulCost = HugeCost };
- EIGEN_CONSTEXPR
- static inline int digits10() { return 0; }
- EIGEN_CONSTEXPR
- static inline int max_digits10() { return 0; }
+ constexpr static int digits10() { return 0; }
+ constexpr static int max_digits10() { return 0; }
private:
static inline std::string epsilon();
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
index 7b2c8dc..1f638f9 100644
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -103,19 +103,36 @@
EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) {
if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
- const Index size4 = (size - 1) & (~3);
+ const Index size4 = 1 + numext::round_down(size - 1, 4);
PacketType p = eval.template packetByOuterInner<Unaligned, PacketType>(0, 0);
- Index i = 1;
// This loop is optimized for instruction pipelining:
// - each iteration generates two independent instructions
// - thanks to branch prediction and out-of-order execution we have independent instructions across loops
- for (; i < size4; i += 4)
+ for (Index i = 1; i < size4; i += 4)
p = func.packetOp(
p, func.packetOp(func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 0, 0),
eval.template packetByOuterInner<Unaligned, PacketType>(i + 1, 0)),
func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 2, 0),
eval.template packetByOuterInner<Unaligned, PacketType>(i + 3, 0))));
- for (; i < size; ++i) p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+ for (Index i = size4; i < size; ++i)
+ p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+ return p;
+ }
+};
+
+template <typename Func, typename Evaluator>
+struct packetwise_segment_redux_impl {
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+ template <typename PacketType>
+ EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin,
+ Index count) {
+ if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
+
+ PacketType p = eval.template packetSegmentByOuterInner<Unaligned, PacketType>(0, 0, begin, count);
+ for (Index i = 1; i < size; ++i)
+ p = func.packetOp(p, eval.template packetSegmentByOuterInner<Unaligned, PacketType>(i, 0, begin, count));
return p;
}
};
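A quick sanity check of the rewritten bound in packetwise_redux_impl: both expressions admit the same unrolled iterations, since i advances in strides of 4 starting from 1.

// Worked example for size = 10:
//   new: size4 = 1 + round_down(10 - 1, 4) = 1 + 8 = 9  -> unrolled i = 1, 5
//   old: size4 = (10 - 1) & ~3             = 8          -> unrolled i = 1, 5
// Either way the unrolled loop reduces packets 1..8, the tail loop handles
// packet 9, and packet 0 seeds p, so all 10 packets are consumed exactly once.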
@@ -174,14 +191,13 @@
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packet(Index idx) const {
- enum { PacketSize = internal::unpacket_traits<PacketType>::size };
- typedef Block<const ArgTypeNestedCleaned, Direction == Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
- Direction == Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime), true /* InnerPanel */>
- PanelType;
-
- PanelType panel(m_arg, Direction == Vertical ? 0 : idx, Direction == Vertical ? idx : 0,
- Direction == Vertical ? m_arg.rows() : Index(PacketSize),
- Direction == Vertical ? Index(PacketSize) : m_arg.cols());
+ static constexpr int PacketSize = internal::unpacket_traits<PacketType>::size;
+ static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : PacketSize;
+ static constexpr int PanelCols = Direction == Vertical ? PacketSize : ArgType::ColsAtCompileTime;
+ using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+ using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+ using BinaryOp = typename MemberOp::BinaryOp;
+ using Impl = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>;
// FIXME
// See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of
@@ -189,11 +205,39 @@
// by pass "vectorization" in this case:
if (PacketSize == 1) return internal::pset1<PacketType>(coeff(idx));
- typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
+ Index startRow = Direction == Vertical ? 0 : idx;
+ Index startCol = Direction == Vertical ? idx : 0;
+ Index numRows = Direction == Vertical ? m_arg.rows() : PacketSize;
+ Index numCols = Direction == Vertical ? PacketSize : m_arg.cols();
+
+ PanelType panel(m_arg, startRow, startCol, numRows, numCols);
PanelEvaluator panel_eval(panel);
- typedef typename MemberOp::BinaryOp BinaryOp;
- PacketType p = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>::template run<PacketType>(
- panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+ PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+ return p;
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index i, Index j, Index begin, Index count) const {
+ return packetSegment<LoadMode, PacketType>(Direction == Vertical ? j : i, begin, count);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packetSegment(Index idx, Index begin, Index count) const {
+ static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : Dynamic;
+ static constexpr int PanelCols = Direction == Vertical ? Dynamic : ArgType::ColsAtCompileTime;
+ using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+ using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+ using BinaryOp = typename MemberOp::BinaryOp;
+ using Impl = internal::packetwise_segment_redux_impl<BinaryOp, PanelEvaluator>;
+
+ Index startRow = Direction == Vertical ? 0 : idx;
+ Index startCol = Direction == Vertical ? idx : 0;
+ Index numRows = Direction == Vertical ? m_arg.rows() : begin + count;
+ Index numCols = Direction == Vertical ? begin + count : m_arg.cols();
+
+ PanelType panel(m_arg, startRow, startCol, numRows, numCols);
+ PanelEvaluator panel_eval(panel);
+ PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), begin, count);
return p;
}
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index eca7e1f..a78305e 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -162,8 +162,8 @@
EIGEN_DEVICE_FUNC Base& base() { return *static_cast<Base*>(this); }
EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast<const Base*>(this); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_storage.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_storage.cols(); }
/** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
* provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
@@ -298,7 +298,7 @@
*
* \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
*/
- EIGEN_DEVICE_FUNC inline constexpr void resize(Index size) {
+ EIGEN_DEVICE_FUNC constexpr void resize(Index size) {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) ||
SizeAtCompileTime == size) &&
@@ -323,7 +323,7 @@
*
* \sa resize(Index,Index)
*/
- EIGEN_DEVICE_FUNC inline constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
+ EIGEN_DEVICE_FUNC constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
/** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special
* value \c NoChange as in the example below.
@@ -333,7 +333,7 @@
*
* \sa resize(Index,Index)
*/
- EIGEN_DEVICE_FUNC inline constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
+ EIGEN_DEVICE_FUNC constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
/** Resizes \c *this to have the same dimensions as \a other.
* Takes care of doing all the checking that's needed.
@@ -450,7 +450,7 @@
/** \brief Move constructor */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(PlainObjectBase&&) = default;
/** \brief Move assignment operator */
- EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) noexcept {
m_storage = std::move(other.m_storage);
return *this;
}
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 37683e3..e16c7cc 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -224,8 +224,8 @@
"if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 9de6481..ce8d954 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -283,7 +283,7 @@
template <typename Lhs, typename Rhs>
struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
template <typename T>
- struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {};
+ struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
@@ -445,7 +445,7 @@
eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
- std::conditional_t<HasScalarFactor, true_type, false_type>());
+ bool_constant<HasScalarFactor>());
}
protected:
@@ -635,6 +635,24 @@
return packet<LoadMode, PacketType>(row, col);
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
+ Index count) const {
+ PacketType res;
+ typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+ Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+ PacketImpl;
+ PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
+ return res;
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
+ const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+ const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+ return packetSegment<LoadMode, PacketType>(row, col, begin, count);
+ }
+
protected:
add_const_on_value_type_t<LhsNested> m_lhs;
add_const_on_value_type_t<RhsNested> m_rhs;
@@ -670,6 +688,13 @@
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+ Index innerDim, Packet& res, Index begin, Index count) {
+ etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+ row, col, lhs, rhs, innerDim, res, begin, count);
+ res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
+ rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
+ }
};
template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -681,6 +706,13 @@
res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+ Index innerDim, Packet& res, Index begin, Index count) {
+ etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+ row, col, lhs, rhs, innerDim, res, begin, count);
+ res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
+ pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
+ }
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -689,6 +721,12 @@
Index /*innerDim*/, Packet& res) {
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+ Index /*innerDim*/, Packet& res, Index begin,
+ Index count) {
+ res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),
+ rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
+ }
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -697,6 +735,12 @@
Index /*innerDim*/, Packet& res) {
res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+ Index /*innerDim*/, Packet& res, Index begin,
+ Index count) {
+ res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
+ pset1<Packet>(rhs.coeff(Index(0), col)));
+ }
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -705,6 +749,11 @@
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+ const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+ Index /*begin*/, Index /*count*/) {
+ res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+ }
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -713,6 +762,11 @@
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+ const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+ Index /*begin*/, Index /*count*/) {
+ res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+ }
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -723,6 +777,13 @@
for (Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+ Index innerDim, Packet& res, Index begin, Index count) {
+ res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+ for (Index i = 0; i < innerDim; ++i)
+ res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
+ res);
+ }
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -733,6 +794,13 @@
for (Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+ Index innerDim, Packet& res, Index begin, Index count) {
+ res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+ for (Index i = 0; i < innerDim; ++i)
+ res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
+ res);
+ }
};
/***************************************************************************
@@ -871,6 +939,26 @@
m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
}
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+ internal::true_type) const {
+ return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+ internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+ internal::false_type) const {
+ enum {
+ InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+ DiagonalPacketLoadMode = plain_enum_min(
+ LoadMode,
+ ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
+ };
+ return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+ m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
+ }
+
evaluator<DiagonalType> m_diagImpl;
evaluator<MatrixType> m_matImpl;
};
@@ -892,7 +980,8 @@
typedef typename XprType::PlainObject PlainObject;
typedef typename Lhs::DiagonalVectorType DiagonalType;
- enum { StorageOrder = Base::StorageOrder_ };
+ static constexpr int StorageOrder = Base::StorageOrder_;
+ using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
@@ -905,8 +994,7 @@
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
// FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
// See also similar calls below.
- return this->template packet_impl<LoadMode, PacketType>(
- row, col, row, std::conditional_t<int(StorageOrder) == RowMajor, internal::true_type, internal::false_type>());
+ return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
}
template <int LoadMode, typename PacketType>
@@ -914,6 +1002,19 @@
return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
int(StorageOrder) == ColMajor ? 0 : idx);
}
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+ // See also similar calls below.
+ return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+ return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+ begin, count);
+ }
#endif
};
@@ -933,7 +1034,8 @@
typedef Product<Lhs, Rhs, ProductKind> XprType;
typedef typename XprType::PlainObject PlainObject;
- enum { StorageOrder = Base::StorageOrder_ };
+ static constexpr int StorageOrder = Base::StorageOrder_;
+ using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
@@ -944,14 +1046,23 @@
#ifndef EIGEN_GPUCC
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
- return this->template packet_impl<LoadMode, PacketType>(
- row, col, col, std::conditional_t<int(StorageOrder) == ColMajor, internal::true_type, internal::false_type>());
+ return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
- return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
- int(StorageOrder) == ColMajor ? 0 : idx);
+ return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+ return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
+ }
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+ return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+ begin, count);
}
#endif
};
diff --git a/Eigen/src/Core/RandomImpl.h b/Eigen/src/Core/RandomImpl.h
index 76e43f5..efba336 100644
--- a/Eigen/src/Core/RandomImpl.h
+++ b/Eigen/src/Core/RandomImpl.h
@@ -122,7 +122,7 @@
((std::numeric_limits<long double>::digits != (2 * std::numeric_limits<double>::digits)))>
struct random_longdouble_impl {
static constexpr int Size = sizeof(long double);
- static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<long double>::digits() - 1; }
+ static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<long double>::digits() - 1; }
static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
EIGEN_USING_STD(memcpy);
@@ -140,7 +140,7 @@
};
template <>
struct random_longdouble_impl<false> {
- static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<double>::digits() - 1; }
+ static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<double>::digits() - 1; }
static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
return static_cast<long double>(random_float_impl<double>::run(numRandomBits));
}
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0c5f2d9..4e9ab0e 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -414,6 +414,13 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const {
return Base::template packet<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer);
}
+
+ template <int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentByOuterInner(Index outer, Index inner, Index begin,
+ Index count) const {
+ return Base::template packetSegment<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer,
+ begin, count);
+ }
};
} // end namespace internal
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 129bc85..30ec277 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -73,11 +73,11 @@
typedef MapBase<Derived> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size()
: int(Flags) & RowMajorBit ? this->cols()
@@ -97,11 +97,11 @@
typedef Stride<StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime> StrideBase;
// Resolves inner stride if default 0.
- static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
+ static EIGEN_DEVICE_FUNC constexpr Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
// Resolves outer stride if default 0.
- static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
- bool isVectorAtCompileTime, bool isRowMajor) {
+ static EIGEN_DEVICE_FUNC constexpr Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
+ bool isVectorAtCompileTime, bool isRowMajor) {
return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
}
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 11d7ad1..3415045 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -85,8 +85,8 @@
THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
EIGEN_DEVICE_FUNC const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 4b34e16..22acdc0 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -215,10 +215,10 @@
EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
/** \sa MapBase::innerStride() */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return m_xpr.innerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return m_xpr.innerStride(); }
/** \sa MapBase::outerStride() */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
return (((Flags & RowMajorBit) == RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
}
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 3b5e470..892c193 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -58,12 +58,8 @@
EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
static_cast<const Derived*>(this)->evalTo(dst);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT {
- return static_cast<const Derived*>(this)->rows();
- }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT {
- return static_cast<const Derived*>(this)->cols();
- }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return static_cast<const Derived*>(this)->rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return static_cast<const Derived*>(this)->cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
#define Unusable \
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index eb06fff..d11ba16 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -87,8 +87,8 @@
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return -m_matrix.innerStride(); }
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 9f46120..0fa5f1e 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -63,8 +63,8 @@
eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
}
- inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
- inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_condition.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_condition.cols(); }
inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i, Index j) const {
if (m_condition.coeff(i, j))
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 4e9a923..16f0e75 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -73,10 +73,10 @@
EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_matrix.outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.innerStride(); }
/** \sa MatrixBase::coeff()
* \warning the coordinates must fit into the referenced triangular part
diff --git a/Eigen/src/Core/SkewSymmetricMatrix3.h b/Eigen/src/Core/SkewSymmetricMatrix3.h
index 1945fd3..3545afc 100644
--- a/Eigen/src/Core/SkewSymmetricMatrix3.h
+++ b/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -66,7 +66,7 @@
EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
/** Determinant vanishes */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar determinant() const { return 0; }
+ EIGEN_DEVICE_FUNC constexpr Scalar determinant() const { return 0; }
/** A.transpose() = -A */
EIGEN_DEVICE_FUNC PlainObject transpose() const { return (-vector()).asSkewSymmetric(); }
@@ -91,9 +91,9 @@
EIGEN_DEVICE_FUNC inline SkewSymmetricVectorType& vector() { return derived().vector(); }
/** \returns the number of rows. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return 3; }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const { return 3; }
/** \returns the number of columns. */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return 3; }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const { return 3; }
/** \returns the matrix product of \c *this by the dense matrix, \a matrix */
template <typename MatrixDerived>
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index dfea9c6..aa51410 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -66,8 +66,8 @@
Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 26d62ff..9d31874 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -216,8 +216,8 @@
triangular_solve_retval(const TriangularType& tri, const Rhs& rhs) : m_triangularMatrix(tri), m_rhs(rhs) {}
- inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
- inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+ constexpr Index rows() const noexcept { return m_rhs.rows(); }
+ constexpr Index cols() const noexcept { return m_rhs.cols(); }
template <typename Dest>
inline void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index 4576cc0..a24d4c2 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -36,11 +36,11 @@
typedef Index difference_type;
typedef std::random_access_iterator_tag iterator_category;
- indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
- indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
+ indexed_based_stl_iterator_base() noexcept : mp_xpr(0), m_index(0) {}
+ indexed_based_stl_iterator_base(XprType& xpr, Index index) noexcept : mp_xpr(&xpr), m_index(index) {}
- indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW : mp_xpr(other.mp_xpr),
- m_index(other.m_index) {}
+ indexed_based_stl_iterator_base(const non_const_iterator& other) noexcept
+ : mp_xpr(other.mp_xpr), m_index(other.m_index) {}
indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) {
mp_xpr = other.mp_xpr;
@@ -335,15 +335,14 @@
typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer;
typedef std::conditional_t<bool(is_lvalue), value_type&, const value_type&> reference;
- pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
- pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) {
+ pointer_based_stl_iterator() noexcept : m_ptr(0) {}
+ pointer_based_stl_iterator(XprType& xpr, Index index) noexcept : m_incr(xpr.innerStride()) {
m_ptr = xpr.data() + index * m_incr.value();
}
- pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW : m_ptr(other.m_ptr),
- m_incr(other.m_incr) {}
+ pointer_based_stl_iterator(const non_const_iterator& other) noexcept : m_ptr(other.m_ptr), m_incr(other.m_incr) {}
- pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW {
+ pointer_based_stl_iterator& operator=(const non_const_iterator& other) noexcept {
m_ptr = other.m_ptr;
m_incr.setValue(other.m_incr);
return *this;
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 14b025c..692f0a1 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -78,9 +78,9 @@
}
/** \returns the outer stride */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); }
+ EIGEN_DEVICE_FUNC constexpr Index outer() const { return m_outer.value(); }
/** \returns the inner stride */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); }
+ EIGEN_DEVICE_FUNC constexpr Index inner() const { return m_inner.value(); }
protected:
internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index d417c1a..dd825e9 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -65,6 +65,31 @@
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode, LoadMode, PacketType>(row, col);
}
+
+ template <int StoreMode, int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+ PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+ const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+ row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count);
+ m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count);
+ }
+
+ template <int StoreMode, int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+ PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count);
+ const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+ index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count);
+ m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
+ }
+
+  // TODO: find a simple way to avoid copy/pasting this function from generic_dense_assignment_kernel; by simple,
+  // I mean no CRTP (Gael)
+ template <int StoreMode, int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
+ Index row = Base::rowIndexByOuterInner(outer, inner);
+ Index col = Base::colIndexByOuterInner(outer, inner);
+ assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count);
+ }
};
} // namespace internal
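For orientation, a scalar-equivalent sketch of what the new kernel entry points do (illustrative only; dst/src
stand for the wrapped evaluators, and the run is assumed to lie along a column-major inner dimension):

  // assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count)
  // swaps only the coefficients [begin, begin + count) of one packet-sized run,
  // leaving all other lanes of both expressions untouched:
  for (Index i = begin; i < begin + count; ++i) {
    Scalar tmp = src.coeff(row + i, col);
    src.coeffRef(row + i, col) = dst.coeff(row + i, col);
    dst.coeffRef(row + i, col) = tmp;
  }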
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 89e3d95..0676a25 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -65,8 +65,8 @@
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.rows(); }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index 6fbbbd8..f6dd258 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -293,9 +293,9 @@
public:
explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+ EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_transpositions.size(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_transpositions.size(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_transpositions.size(); }
/** \returns the \a matrix with the inverse transpositions applied to the columns.
*/
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 2b1683b..27ad78e 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -58,10 +58,10 @@
eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag))));
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
+ EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
// dummy resize function
EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
@@ -194,9 +194,9 @@
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
/** \copydoc EigenBase::rows() */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
/** \copydoc EigenBase::cols() */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
/** \returns a const reference to the nested expression */
EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 9887db6..ac52dc5 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -36,6 +36,7 @@
class PartialReduxExpr;
namespace internal {
+
template <typename MatrixType, typename MemberOp, int Direction>
struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> {
typedef typename MemberOp::result_type Scalar;
@@ -63,12 +64,8 @@
EIGEN_DEVICE_FUNC explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
: m_matrix(mat), m_functor(func) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
- return (Direction == Vertical ? 1 : m_matrix.rows());
- }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
- return (Direction == Horizontal ? 1 : m_matrix.cols());
- }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return (Direction == Vertical ? 1 : m_matrix.rows()); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return (Direction == Horizontal ? 1 : m_matrix.cols()); }
EIGEN_DEVICE_FUNC typename MatrixType::Nested nestedExpression() const { return m_matrix; }
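A quick shape reminder for the rows()/cols() logic above (illustrative):

  Eigen::MatrixXf m(3, 4);
  auto s = m.colwise().sum();  // PartialReduxExpr with Direction == Vertical
  // s.rows() == 1, s.cols() == 4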
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 198ec95..0450e2d 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -25,14 +25,12 @@
template <typename Visitor, bool ShortCircuitEvaluation = false>
struct short_circuit_eval_impl {
// if short circuit evaluation is not used, do nothing
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
};
template <typename Visitor>
struct short_circuit_eval_impl<Visitor, true> {
// if short circuit evaluation is used, check the visitor
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) {
- return visitor.done();
- }
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) { return visitor.done(); }
};
// unrolled inner-outer traversal
@@ -296,9 +294,9 @@
EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType& xpr) : m_evaluator(xpr), m_xpr(xpr) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_xpr.size(); }
// outer-inner access
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
return m_evaluator.coeff(row, col);
@@ -632,6 +630,17 @@
};
};
+template <typename Derived, bool AlwaysTrue = NumTraits<typename traits<Derived>::Scalar>::IsInteger>
+struct all_finite_impl {
+ static EIGEN_DEVICE_FUNC inline bool run(const Derived& /*derived*/) { return true; }
+};
+#if !defined(__FINITE_MATH_ONLY__) || !(__FINITE_MATH_ONLY__)
+template <typename Derived>
+struct all_finite_impl<Derived, false> {
+ static EIGEN_DEVICE_FUNC inline bool run(const Derived& derived) { return derived.array().isFiniteTyped().all(); }
+};
+#endif
+
} // end namespace internal
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
@@ -781,7 +790,7 @@
*/
template <typename Derived>
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::allFinite() const {
- return derived().array().isFiniteTyped().all();
+ return internal::all_finite_impl<Derived>::run(derived());
}
} // end namespace Eigen
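The dispatch above encodes two facts: integer scalars are finite by construction, so all_finite_impl<Derived, true>
answers true without touching the data; and under __FINITE_MATH_ONLY__ (e.g. -ffast-math) the isFiniteTyped() test
is unreliable because the compiler may assume no NaN/Inf exist, so the checking specialization is compiled out.
A minimal usage sketch (illustrative; requires <limits>):

  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 3);
  m(1, 1) = std::numeric_limits<double>::quiet_NaN();
  bool ok = m.allFinite();  // false in a standard build; true under finite-math-only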
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index d5506da..a4a87c4 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -475,19 +475,11 @@
}
template <>
EIGEN_STRONG_INLINE Packet4cf pnmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
- __m256 a_odd = _mm256_movehdup_ps(a.v);
- __m256 a_even = _mm256_moveldup_ps(a.v);
- __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
- __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmaddsub_ps(a_even, b.v, c.v));
- return Packet4cf(result);
+ return pnegate(pmsub(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pnmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
- __m256 a_odd = _mm256_movehdup_ps(a.v);
- __m256 a_even = _mm256_moveldup_ps(a.v);
- __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
- __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmsubadd_ps(a_even, b.v, c.v));
- return Packet4cf(result);
+ return pnegate(pmadd(a, b, c));
}
// std::complex<double>
template <>
@@ -508,21 +500,64 @@
}
template <>
EIGEN_STRONG_INLINE Packet2cd pnmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
- __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
- __m256d a_even = _mm256_movedup_pd(a.v);
- __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
- __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmaddsub_pd(a_even, b.v, c.v));
- return Packet2cd(result);
+ return pnegate(pmsub(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
- __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
- __m256d a_even = _mm256_movedup_pd(a.v);
- __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
- __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmsubadd_pd(a_even, b.v, c.v));
- return Packet2cd(result);
+ return pnegate(pmadd(a, b, c));
}
#endif
+
+/*---------------- load/store segment support ----------------*/
+
+/*---------------- std::complex<float> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cf> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4cf> : std::true_type {};
+
+template <>
+inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) {
+ return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin,
+ Index count) {
+ _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
+}
+
+template <>
+inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) {
+ return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin,
+ Index count) {
+ _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
+}
+
+/*---------------- std::complex<double> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cd> : std::true_type {};
+
+template <>
+inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) {
+ return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+ Index begin, Index count) {
+ _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
+}
+
+/*---------------- end load/store segment support ----------------*/
+
} // end namespace internal
} // end namespace Eigen
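The simplification above rests on two algebraic identities, which hold lane-wise for complex packets as well:

  pnmadd(a, b, c) = -(a * b) + c = -(a * b - c) = pnegate(pmsub(a, b, c))
  pnmsub(a, b, c) = -(a * b) - c = -(a * b + c) = pnegate(pmadd(a, b, c))

so the hand-written shuffle/fmaddsub sequences can be replaced by the already-vectorized pmadd/pmsub plus a sign flip.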
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index a5c38e7..5b7285f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -28,6 +28,7 @@
EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
#ifdef EIGEN_VECTORIZE_AVX2
EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
@@ -106,6 +107,8 @@
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp2)
@@ -118,6 +121,7 @@
F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+#endif
} // end namespace internal
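Usage note (illustrative, assuming Eigen exposes a cbrt() entry point on Array expressions, as it does for sqrt()):

  Eigen::ArrayXd x = Eigen::ArrayXd::Random(100);
  Eigen::ArrayXd y = x.cbrt();  // now dispatches to the vectorized pcbrt kernels above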
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index c29523a..470e36d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -122,6 +122,7 @@
HasBessel = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
@@ -150,6 +151,7 @@
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasATan = 1,
HasATanh = 1,
HasBlend = 1
@@ -1839,10 +1841,13 @@
return a;
}
+#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
}
+#endif // EIGEN_VECTORIZE_AVX512FP16
+
template <>
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
@@ -2044,10 +2049,13 @@
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
+#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8h& x) {
return _mm_movemask_epi8(x) != 0;
}
+#endif // EIGEN_VECTORIZE_AVX512FP16
+
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& x) {
return _mm_movemask_epi8(x) != 0;
@@ -2211,7 +2219,6 @@
};
typedef Packet8h half;
};
-#endif
template <>
EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
@@ -2411,6 +2418,26 @@
}
template <>
+EIGEN_STRONG_INLINE Packet8h pmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@@ -2446,14 +2473,12 @@
to[stride * 7] = aux[7];
}
-#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux<Packet8f>(af);
return Eigen::half(reduced);
}
-#endif
template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
@@ -2553,6 +2578,8 @@
kernel.packet[3] = pload<Packet8h>(out[3]);
}
+#endif
+
// BFloat16 implementation.
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
@@ -2781,6 +2808,26 @@
}
template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
@@ -2893,6 +2940,258 @@
kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
}
+/*---------------- load/store segment support ----------------*/
+
+// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x8(Index begin, Index count) {
+ eigen_assert(begin >= 0 && begin + count <= 4);
+ long long mask = 1;
+ mask <<= CHAR_BIT * count;
+ mask--;
+ mask <<= CHAR_BIT * begin;
+#if defined(_WIN32) && !defined(_WIN64)
+ return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+ return _mm_cvtsi64_si128(mask);
+#endif
+}
+
+// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_8x8(Index begin, Index count) {
+ eigen_assert(begin >= 0 && begin + count <= 8);
+ long long mask = 1;
+  // split the shift in two to avoid UB (a single 64-bit shift by CHAR_BIT * 8 == 64 when count == 8)
+ mask <<= (CHAR_BIT / 2) * count;
+ mask <<= (CHAR_BIT / 2) * count;
+ mask--;
+ mask <<= CHAR_BIT * begin;
+#if defined(_WIN32) && !defined(_WIN64)
+ return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+ return _mm_cvtsi64_si128(mask);
+#endif
+}
+
+// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x32(Index begin, Index count) {
+ eigen_assert(begin >= 0 && begin + count <= 4);
+ return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));
+}
+
+// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_2x64(Index begin, Index count) {
+ eigen_assert(begin >= 0 && begin + count <= 2);
+ return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));
+}
+
+// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_8x32(Index begin, Index count) {
+ __m128i mask_epi8 = segment_mask_8x8(begin, count);
+#ifdef EIGEN_VECTORIZE_AVX2
+ __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);
+#else
+ __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);
+ __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));
+ __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
+#endif
+ return mask_epi32;
+}
+
+// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_4x64(Index begin, Index count) {
+ __m128i mask_epi8 = segment_mask_4x8(begin, count);
+#ifdef EIGEN_VECTORIZE_AVX2
+ __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);
+#else
+ __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);
+ __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));
+ __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
+#endif
+ return mask_epi64;
+}
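+
+// Worked example (illustrative): segment_mask_8x32(2, 3) first builds the
+// byte mask (((1 << 12) << 12) - 1) << 16 = 0xFFFFFF0000, i.e. bytes [2, 5)
+// set, then sign-extends each byte to 32 bits, so exactly the 32-bit lanes
+// [2, 5) are all-ones; these are the lanes a subsequent maskload/maskstore
+// touches.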
+
+/*---------------- float ----------------*/
+
+template <>
+struct has_packet_segment<Packet4f> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8f> : std::true_type {};
+
+template <>
+inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
+ return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
+ _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
+ return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
+ _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
+}
+
+/*---------------- int32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4i> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8i> : std::true_type {};
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+ return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+ _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+ return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+ _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
+}
+
+#else
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+ return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+ pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+ return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+ pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
+}
+
+#endif
+
+/*---------------- uint32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ui> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8ui> : std::true_type {};
+
+template <>
+inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
+ return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
+ pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
+}
+
+template <>
+inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
+ return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
+ pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
+}
+
+/*---------------- double ----------------*/
+
+template <>
+struct has_packet_segment<Packet2d> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4d> : std::true_type {};
+
+template <>
+inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
+ return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
+ _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
+}
+
+template <>
+inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
+ return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
+ _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/*---------------- int64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet2l> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4l> : std::true_type {};
+
+template <>
+inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
+ return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
+ _mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
+}
+template <>
+inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
+ return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
+ _mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
+}
+
+/*---------------- uint64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ul> : std::true_type {};
+
+template <>
+inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
+ return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
+}
+template <>
+inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
+ pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
+}
+#endif
+
+/*---------------- end load/store segment support ----------------*/
+
} // end namespace internal
} // end namespace Eigen
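Semantics sketch for the new primitives (illustrative; r denotes the returned packet viewed as an array of scalar
lanes):

  // ploaduSegment(from, begin, count): masked unaligned load;
  // maskload zeroes the unselected lanes.
  for (Index i = 0; i < PacketSize; ++i)
    r[i] = (begin <= i && i < begin + count) ? from[i] : 0;

  // pstoreuSegment(to, x, begin, count): masked unaligned store;
  // lanes outside [begin, begin + count) are never written.
  for (Index i = begin; i < begin + count; ++i)
    to[i] = x[i];

This is what lets partial redux and the masked load/store framework handle a tail of fewer than PacketSize elements
without reading or writing past the end of a buffer.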
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 9dcd6ef..5b73ffe 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -279,20 +279,22 @@
}
#endif
+#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
return half2float(a);
}
template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
- return Bf16ToF32(a);
-}
-
-template <>
EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
return float2half(a);
}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+ return Bf16ToF32(a);
+}
template <>
EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 6039254..04499a0 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,16 +47,16 @@
#if EIGEN_FAST_MATH
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& _x) {
- return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& x) {
+ return generic_sqrt_newton_step<Packet16f>::run(x, _mm512_rsqrt14_ps(x));
}
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& x) {
#ifdef EIGEN_VECTORIZE_AVX512ER
- return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+ return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
#else
- return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+ return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
#endif
}
#else
@@ -80,19 +80,19 @@
#elif EIGEN_FAST_MATH
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& _x) {
- return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+ return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(x, _mm512_rsqrt14_ps(x));
}
#endif
// prsqrt for double.
#if EIGEN_FAST_MATH
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& x) {
#ifdef EIGEN_VECTORIZE_AVX512ER
- return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+ return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
#else
- return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+ return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
#endif
}
@@ -118,6 +118,8 @@
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2)
@@ -130,6 +132,7 @@
F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+#endif // EIGEN_VECTORIZE_AVX512FP16
} // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
new file mode 100644
index 0000000..240ade4
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+#define EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+ __m512i result = _mm512_castsi256_si512(_mm256_castph_si256(a));
+ result = _mm512_inserti64x4(result, _mm256_castph_si256(b), 1);
+ return _mm512_castsi512_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+ a = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_castph_si512(x)));
+ b = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(_mm512_castph_si512(x), 1));
+}
+
+#define _EIGEN_GENERATE_FP16_MATH_FUNCTION(func) \
+ template <> \
+ EIGEN_STRONG_INLINE Packet8h func<Packet8h>(const Packet8h& a) { \
+ return float2half(func(half2float(a))); \
+ } \
+ \
+ template <> \
+ EIGEN_STRONG_INLINE Packet16h func<Packet16h>(const Packet16h& a) { \
+ return float2half(func(half2float(a))); \
+ } \
+ \
+ template <> \
+ EIGEN_STRONG_INLINE Packet32h func<Packet32h>(const Packet32h& a) { \
+ Packet16h low; \
+ Packet16h high; \
+ extract2Packet16h(a, low, high); \
+ return combine2Packet16h(func(low), func(high)); \
+ }
+
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(psin)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pcos)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog2)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog1p)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexpm1)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp2)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(ptanh)
+#undef _EIGEN_GENERATE_FP16_MATH_FUNCTION
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+ return pfrexp_generic(a, exponent);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+ return pldexp_generic(a, exponent);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
\ No newline at end of file
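For reference, the generator macro above expands (for psin, say) to three specializations; the Packet32h one is,
modulo formatting:

  template <>
  EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
    Packet16h low;
    Packet16h high;
    extract2Packet16h(a, low, high);                  // split the 512-bit packet in half
    return combine2Packet16h(psin(low), psin(high));  // each half is computed via float
  }

i.e. the FP16 transcendental functions still round-trip through Packet16f (half2float/float2half); only the packet
plumbing is native _Float16.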
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 5d869e4..27a0f10 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -40,6 +40,10 @@
#endif
typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
+typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
+typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
+typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
+
template <>
struct is_arithmetic<__m512> {
enum { value = true };
@@ -124,6 +128,7 @@
HasATanh = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasLog = 1,
HasLog1p = 1,
HasExpm1 = 1,
@@ -149,6 +154,7 @@
HasBlend = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
@@ -249,6 +255,39 @@
#endif
template <>
+struct unpacket_traits<Packet32s> {
+ typedef numext::int16_t type;
+ typedef Packet16s half;
+ enum {
+ size = 32,
+ alignment = Aligned64,
+ vectorizable = false,
+ };
+};
+
+template <>
+struct unpacket_traits<Packet16s> {
+ typedef numext::int16_t type;
+ typedef Packet8s half;
+ enum {
+ size = 16,
+ alignment = Aligned32,
+ vectorizable = false,
+ };
+};
+
+template <>
+struct unpacket_traits<Packet8s> {
+ typedef numext::int16_t type;
+ typedef Packet8s half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = false,
+ };
+};
+
+template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
return _mm512_set1_ps(from);
}
@@ -1335,10 +1374,13 @@
return _mm512_abs_epi64(a);
}
+#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
return _mm256_srai_epi16(a, 15);
}
+#endif // EIGEN_VECTORIZE_AVX512FP16
+
template <>
EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
return _mm256_srai_epi16(a, 15);
@@ -2199,6 +2241,7 @@
}
// Packet math for Eigen::half
+#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
return _mm256_set1_epi16(from.x);
@@ -2223,6 +2266,7 @@
EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+ EIGEN_DEBUG_ALIGNED_STORE
_mm256_store_si256((__m256i*)(void*)to, from);
}
@@ -2230,6 +2274,7 @@
EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+ EIGEN_DEBUG_UNALIGNED_STORE
_mm256_storeu_si256((__m256i*)(void*)to, from);
}
@@ -2369,7 +2414,6 @@
return _mm256_xor_si256(a, sign_mask);
}
-#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
@@ -2403,13 +2447,31 @@
}
template <>
+EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
}
-#endif
-
template <>
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
Packet8h lane0 = _mm256_extractf128_si256(a, 0);
@@ -2643,6 +2705,8 @@
kernel.packet[3] = pload<Packet16h>(out[3]);
}
+#endif // EIGEN_VECTORIZE_AVX512FP16
+
template <>
struct is_arithmetic<Packet16bf> {
enum { value = true };
@@ -2714,11 +2778,13 @@
template <>
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE
_mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE
_mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
}
@@ -2889,7 +2955,27 @@
template <>
EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
- return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+ return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+ return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+ return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+ return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+ return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
}
template <>
@@ -3095,6 +3181,172 @@
kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
}
+// Minimal implementation of 16-bit int packets for use in pfrexp, pldexp.
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) {
+ return _mm512_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) {
+ return _mm256_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) {
+ return _mm_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+ EIGEN_DEBUG_ALIGNED_STORE
+ _mm512_store_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+ EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+ _mm256_store_epi32(out, x);
+#else
+ _mm256_store_si256(reinterpret_cast<__m256i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+ EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm_store_epi32(out, x);
+#else
+ _mm_store_si128(reinterpret_cast<__m128i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+ EIGEN_DEBUG_UNALIGNED_STORE
+ _mm512_storeu_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm256_storeu_epi32(out, x);
+#else
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm_storeu_epi32(out, x);
+#else
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) {
+ return _mm512_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) {
+ return _mm256_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) {
+ return _mm_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) {
+ return _mm512_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) {
+ return _mm256_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) {
+ return _mm_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) {
+ return _mm512_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) {
+ return _mm256_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) {
+ return _mm_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) {
+ return _mm512_sub_epi16(_mm512_setzero_si512(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) {
+ return _mm256_sub_epi16(_mm256_setzero_si256(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+ return _mm_sub_epi16(_mm_setzero_si128(), a);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
+ return _mm512_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
+ return _mm256_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+ return _mm_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
+ return _mm512_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
+ return _mm256_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+ return _mm_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
+ return _mm512_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
+ return _mm256_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+ return _mm_srli_epi16(a, N);
+}
+
} // end namespace internal
} // end namespace Eigen
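These deliberately minimal (vectorizable = false) int16 packets are consumed by the generic pfrexp/pldexp paths: the
FP16 header below registers them as unpacket_traits<...>::integer_packet, and the generic code manipulates the
biased exponent field of each half-precision lane with the padd/shift helpers above. A hedged lane-wise sketch of
the pldexp idea (not the exact generic code, which also clamps the exponent):

  // two_pow_e = bit_cast<half>((e + 15) << 10);  // fp16: 10 mantissa bits, exponent bias 15
  // result    = x * two_pow_e;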
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index 038e233..ef64bc5 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
-//
+// Copyright (C) 2025 The Eigen Authors.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,8 +18,8 @@
namespace internal {
typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+typedef __m256h Packet16h;
+typedef __m128h Packet8h;
template <>
struct is_arithmetic<Packet8h> {
@@ -68,6 +68,7 @@
struct unpacket_traits<Packet32h> {
typedef Eigen::half type;
typedef Packet16h half;
+ typedef Packet32s integer_packet;
enum {
size = 32,
alignment = Aligned64,
@@ -81,6 +82,7 @@
struct unpacket_traits<Packet16h> {
typedef Eigen::half type;
typedef Packet8h half;
+ typedef Packet16s integer_packet;
enum {
size = 16,
alignment = Aligned32,
@@ -94,6 +96,7 @@
struct unpacket_traits<Packet8h> {
typedef Eigen::half type;
typedef Packet8h half;
+ typedef Packet8s integer_packet;
enum {
size = 8,
alignment = Aligned16,
@@ -103,14 +106,33 @@
};
};
+// Conversions
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { return _mm256_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { return _mm512_cvtxps_ph(a); }
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { return _mm256_cvtxps_ph(a); }
+
// Memory functions
// pset1
template <>
EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
- // half/half_raw is bit compatible
- return _mm512_set1_ph(numext::bit_cast<_Float16>(from));
+ return _mm512_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+ return _mm256_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+ return _mm_set1_ph(from.x);
}
template <>
@@ -118,24 +140,47 @@
return _mm512_setzero_ph();
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) {
+ return _mm256_setzero_ph();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) {
+ return _mm_setzero_ph();
+}
+
// pset1frombits
template <>
EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
return _mm512_castsi512_ph(_mm512_set1_epi16(from));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) {
+ return _mm256_castsi256_ph(_mm256_set1_epi16(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) {
+ return _mm_castsi128_ph(_mm_set1_epi16(from));
+}
+
// pfirst
template <>
EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- return half_impl::raw_uint16_to_half(
- static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
- Eigen::half dest[32];
- _mm512_storeu_ph(dest, from);
- return dest[0];
-#endif
+ return Eigen::half(_mm512_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+ return Eigen::half(_mm256_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+ return Eigen::half(_mm_cvtsh_h(from));
}
// pload
@@ -145,6 +190,16 @@
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from);
+}
+
// ploadu
template <>
@@ -152,6 +207,16 @@
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from);
+}
+
// pstore
template <>
@@ -159,6 +224,16 @@
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
}
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from);
+}
+
// pstoreu
template <>
@@ -166,6 +241,16 @@
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from);
+}
+
// ploaddup
template <>
EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
@@ -175,6 +260,17 @@
a);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
+ __m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from));
+ return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
+ return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x);
+}
+
// ploadquad
template <>
EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
@@ -184,6 +280,17 @@
a);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
+ return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x,
+ from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
+ return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
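+
+// (Illustrative: ploaddup maps {a, b, c, d, ...} to {a, a, b, b, c, c, d, d, ...}
+// and ploadquad repeats each of the first size/4 source elements four times,
+// matching the permute/set patterns above.)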
+
// pabs
template <>
@@ -191,6 +298,16 @@
return _mm512_abs_ph(a);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) {
+ return _mm256_abs_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) {
+ return _mm_abs_ph(a);
+}
+
// psignbit
template <>
@@ -198,6 +315,16 @@
return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) {
+ return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) {
+ return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15));
+}
+
// pmin
template <>
@@ -205,6 +332,16 @@
return _mm512_min_ph(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_min_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_min_ph(a, b);
+}
+
// pmax
template <>
@@ -212,6 +349,16 @@
return _mm512_max_ph(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_max_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_max_ph(a, b);
+}
+
// plset
template <>
EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
@@ -219,6 +366,16 @@
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+ return _mm256_add_ph(pset1<Packet16h>(a), _mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+ return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0));
+}
+
// por
template <>
@@ -226,6 +383,16 @@
return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
+ return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
// pxor
template <>
@@ -233,6 +400,16 @@
return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+ return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
// pand
template <>
@@ -240,6 +417,16 @@
return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+ return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
// pandnot
template <>
@@ -247,6 +434,16 @@
return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+ return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a)));
+}
+
// pselect
template <>
@@ -255,6 +452,18 @@
return _mm512_mask_blend_ph(mask32, a, b);
}
+template <>
+EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+ __mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ);
+ return _mm256_mask_blend_ph(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+ __mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ);
+ return _mm_mask_blend_ph(mask8, a, b);
+}
+
// pcmp_eq
template <>
@@ -263,6 +472,18 @@
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
// pcmp_le
template <>
@@ -271,6 +492,18 @@
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ);
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ);
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
// pcmp_lt
template <>
@@ -279,6 +512,18 @@
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ);
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ);
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
// pcmp_lt_or_nan
template <>
@@ -287,6 +532,18 @@
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
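
_CMP_NGE_UQ reads "not greater-or-equal, unordered, quiet": the predicate holds when a < b or when either operand is NaN, which is exactly the pcmp_lt_or_nan contract. Scalar form of the same predicate:

    // True iff a < b or either operand is NaN.
    inline bool lt_or_nan(float a, float b) { return !(a >= b); }
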
// padd
template <>
@@ -296,12 +553,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+ return _mm256_add_ph(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+ return _mm_add_ph(a, b);
}
// psub
@@ -313,12 +570,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+ return _mm256_sub_ph(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+ return _mm_sub_ph(a, b);
}
// pmul
@@ -330,12 +587,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+ return _mm256_mul_ph(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+ return _mm_mul_ph(a, b);
}
// pdiv
@@ -347,12 +604,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+ return _mm256_div_ph(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+ return _mm_div_ph(a, b);
}
// pround
@@ -361,14 +619,40 @@
EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
// Work-around for default std::round rounding mode.
- // Mask for the sign bit
- const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
- // The largest half-preicision float less than 0.5
+ // Mask for the sign bit.
+  const Packet32h signMask =
+      pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+ // The largest half-precision float less than 0.5.
const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+ // Work-around for default std::round rounding mode.
+
+ // Mask for the sign bit.
+  const Packet16h signMask =
+      pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x8000u));
+ // The largest half-precision float less than 0.5.
+ const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu));
+
+ return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+ // Work-around for default std::round rounding mode.
+
+ // Mask for the sign bit.
+  const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x8000u));
+ // The largest half-precision float less than 0.5.
+ const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu));
+
+ return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
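
The three pround specializations emulate round-half-away-from-zero (the std::round convention) using only truncation: add the largest representable value below 0.5 with the sign of the input, let the addition's round-to-nearest carry exact halves across the integer boundary, then truncate. A scalar sketch of the same trick in float (0x37FF is the corresponding constant for half):

    #include <cmath>

    float round_half_away_from_zero(float a) {
      const float prev0dot5 = std::nextafter(0.5f, 0.0f);  // largest float < 0.5
      return std::trunc(a + std::copysign(prev0dot5, a));  // e.g. 2.5f -> 3.0f
    }
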
// print
template <>
@@ -376,6 +660,16 @@
return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+ return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+ return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
// pceil
template <>
@@ -383,6 +677,16 @@
return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+ return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+ return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
// pfloor
template <>
@@ -390,6 +694,16 @@
return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+ return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+ return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
// ptrunc
template <>
@@ -397,47 +711,99 @@
return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
+ return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
+ return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
// predux
template <>
EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
- return (half)_mm512_reduce_add_ph(a);
+ return half(_mm512_reduce_add_ph(a));
}
template <>
EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
- return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+ return half(_mm256_reduce_add_ph(a));
}
template <>
EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
- return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+ return half(_mm_reduce_add_ph(a));
}
// predux_half_dowto4
template <>
EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
- __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+ const __m512i bits = _mm512_castph_si512(a);
+ Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
+ Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
+ return padd(lo, hi);
+}
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
- Eigen::half data[32];
- _mm512_storeu_ph(data, a);
-
- __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
- __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+ Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));
+ Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));
+ return padd(lo, hi);
}
// predux_max
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) {
+ return half(_mm512_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) {
+ return half(_mm256_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) {
+ return half(_mm_reduce_max_ph(a));
+}
+
// predux_min
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) {
+ return half(_mm512_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet16h>(const Packet16h& a) {
+ return half(_mm256_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) {
+ return half(_mm_reduce_min_ph(a));
+}
+
// predux_mul
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) {
+ return half(_mm512_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) {
+ return half(_mm256_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) {
+ return half(_mm_reduce_mul_ph(a));
+}
+
#ifdef EIGEN_VECTORIZE_FMA
// pmadd
@@ -449,12 +815,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+ return _mm256_fmadd_ph(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+ return _mm_fmadd_ph(a, b, c);
}
// pmsub
@@ -466,12 +832,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+ return _mm256_fmsub_ph(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+ return _mm_fmsub_ph(a, b, c);
}
// pnmadd
@@ -483,12 +849,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+ return _mm256_fnmadd_ph(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+ return _mm_fnmadd_ph(a, b, c);
}
// pnmsub
@@ -500,12 +866,12 @@
template <>
EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+ return _mm256_fnmsub_ph(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+ return _mm_fnmsub_ph(a, b, c);
}
#endif
@@ -514,35 +880,74 @@
template <>
EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
- return psub(pzero(a), a);
+ return _mm512_castsi512_ph(
+ _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
+ return _mm256_castsi256_ph(
+ _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
+ return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
}
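
The XOR-based negation avoids arithmetic entirely: IEEE half stores its sign in bit 15, so flipping that bit negates every value, including zeros, infinities, and NaNs, without touching the payload. Scalar equivalent on the raw representation:

    #include <cstdint>

    inline std::uint16_t negate_half_bits(std::uint16_t bits) {
      return bits ^ std::uint16_t(0x8000u);  // flip only the sign bit
    }
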
// pconj
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
- return a;
-}
+// Nothing, packets are real.
// psqrt
template <>
EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
- return _mm512_sqrt_ph(a);
+ return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) {
+ return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) {
+ return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a));
}
// prsqrt
template <>
EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
- return _mm512_rsqrt_ph(a);
+ return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) {
+ return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) {
+ return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a));
}
// preciprocal
template <>
EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
- return _mm512_rcp_ph(a);
+ return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) {
+ return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) {
+ return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a));
}
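
The hardware rcp/rsqrt instructions only deliver a low-precision seed, so psqrt, prsqrt, and preciprocal now refine it through the generic_*_newton_step helpers. For the reciprocal the classic iteration is x_{k+1} = x_k * (2 - a * x_k), which roughly doubles the number of correct bits per step. A scalar sketch of one step (the Eigen helpers additionally patch up special cases such as a == 0 and a == inf):

    inline float reciprocal_newton_step(float a, float x) {
      return x * (2.0f - a * x);  // one refinement of x ~= 1/a
    }
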
// ptranspose
@@ -663,6 +1068,246 @@
a.packet[3] = _mm512_castsi512_ph(a3);
}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
+ __m256i a = _mm256_castph_si256(kernel.packet[0]);
+ __m256i b = _mm256_castph_si256(kernel.packet[1]);
+ __m256i c = _mm256_castph_si256(kernel.packet[2]);
+ __m256i d = _mm256_castph_si256(kernel.packet[3]);
+ __m256i e = _mm256_castph_si256(kernel.packet[4]);
+ __m256i f = _mm256_castph_si256(kernel.packet[5]);
+ __m256i g = _mm256_castph_si256(kernel.packet[6]);
+ __m256i h = _mm256_castph_si256(kernel.packet[7]);
+ __m256i i = _mm256_castph_si256(kernel.packet[8]);
+ __m256i j = _mm256_castph_si256(kernel.packet[9]);
+ __m256i k = _mm256_castph_si256(kernel.packet[10]);
+ __m256i l = _mm256_castph_si256(kernel.packet[11]);
+ __m256i m = _mm256_castph_si256(kernel.packet[12]);
+ __m256i n = _mm256_castph_si256(kernel.packet[13]);
+ __m256i o = _mm256_castph_si256(kernel.packet[14]);
+ __m256i p = _mm256_castph_si256(kernel.packet[15]);
+
+ __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+ __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+ __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+ __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+ __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+ __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+ __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+ __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+ __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+ __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+ __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+ __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+ __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+ __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+ __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+ __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+ __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+ __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+ __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+ __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+ __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+ __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+ __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+ __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+ __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+ __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+ __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+ __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+ __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+ __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+ __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+ __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+ __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+ __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+ __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+ __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+ __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+ __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+ __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+ __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+ __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+ __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+ __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+ __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+ __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+ __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+ __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+ __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: there is no 256-bit cross-lane unpacklo/hi, so use permutes instead.
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+ __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+ __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+ __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+ __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+ __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+ __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+ __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+ __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+ __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+ __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+ __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+ __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+ __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+ __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+ __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+ kernel.packet[0] = _mm256_castsi256_ph(a_p_0);
+ kernel.packet[1] = _mm256_castsi256_ph(a_p_1);
+ kernel.packet[2] = _mm256_castsi256_ph(a_p_2);
+ kernel.packet[3] = _mm256_castsi256_ph(a_p_3);
+ kernel.packet[4] = _mm256_castsi256_ph(a_p_4);
+ kernel.packet[5] = _mm256_castsi256_ph(a_p_5);
+ kernel.packet[6] = _mm256_castsi256_ph(a_p_6);
+ kernel.packet[7] = _mm256_castsi256_ph(a_p_7);
+ kernel.packet[8] = _mm256_castsi256_ph(a_p_8);
+ kernel.packet[9] = _mm256_castsi256_ph(a_p_9);
+ kernel.packet[10] = _mm256_castsi256_ph(a_p_a);
+ kernel.packet[11] = _mm256_castsi256_ph(a_p_b);
+ kernel.packet[12] = _mm256_castsi256_ph(a_p_c);
+ kernel.packet[13] = _mm256_castsi256_ph(a_p_d);
+ kernel.packet[14] = _mm256_castsi256_ph(a_p_e);
+ kernel.packet[15] = _mm256_castsi256_ph(a_p_f);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
+ EIGEN_ALIGN64 half in[8][16];
+ pstore<half>(in[0], kernel.packet[0]);
+ pstore<half>(in[1], kernel.packet[1]);
+ pstore<half>(in[2], kernel.packet[2]);
+ pstore<half>(in[3], kernel.packet[3]);
+ pstore<half>(in[4], kernel.packet[4]);
+ pstore<half>(in[5], kernel.packet[5]);
+ pstore<half>(in[6], kernel.packet[6]);
+ pstore<half>(in[7], kernel.packet[7]);
+
+ EIGEN_ALIGN64 half out[8][16];
+
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ out[i][j] = in[j][2 * i];
+ }
+ for (int j = 0; j < 8; ++j) {
+ out[i][j + 8] = in[j][2 * i + 1];
+ }
+ }
+
+ kernel.packet[0] = pload<Packet16h>(out[0]);
+ kernel.packet[1] = pload<Packet16h>(out[1]);
+ kernel.packet[2] = pload<Packet16h>(out[2]);
+ kernel.packet[3] = pload<Packet16h>(out[3]);
+ kernel.packet[4] = pload<Packet16h>(out[4]);
+ kernel.packet[5] = pload<Packet16h>(out[5]);
+ kernel.packet[6] = pload<Packet16h>(out[6]);
+ kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
+ EIGEN_ALIGN64 half in[4][16];
+ pstore<half>(in[0], kernel.packet[0]);
+ pstore<half>(in[1], kernel.packet[1]);
+ pstore<half>(in[2], kernel.packet[2]);
+ pstore<half>(in[3], kernel.packet[3]);
+
+ EIGEN_ALIGN64 half out[4][16];
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ out[i][j] = in[j][4 * i];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j + 4] = in[j][4 * i + 1];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j + 8] = in[j][4 * i + 2];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j + 12] = in[j][4 * i + 3];
+ }
+ }
+
+ kernel.packet[0] = pload<Packet16h>(out[0]);
+ kernel.packet[1] = pload<Packet16h>(out[1]);
+ kernel.packet[2] = pload<Packet16h>(out[2]);
+ kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
+ __m128i a = _mm_castph_si128(kernel.packet[0]);
+ __m128i b = _mm_castph_si128(kernel.packet[1]);
+ __m128i c = _mm_castph_si128(kernel.packet[2]);
+ __m128i d = _mm_castph_si128(kernel.packet[3]);
+ __m128i e = _mm_castph_si128(kernel.packet[4]);
+ __m128i f = _mm_castph_si128(kernel.packet[5]);
+ __m128i g = _mm_castph_si128(kernel.packet[6]);
+ __m128i h = _mm_castph_si128(kernel.packet[7]);
+
+ __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+ __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+ __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+ __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+ __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+ __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+ __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+ __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+ __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+ __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+ __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+ __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+ __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+ __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+ __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+ __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+ __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+ __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+ __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+ __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+ __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+ __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+ __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+ __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+ kernel.packet[0] = _mm_castsi128_ph(a0b0c0d0e0f0g0h0);
+ kernel.packet[1] = _mm_castsi128_ph(a1b1c1d1e1f1g1h1);
+ kernel.packet[2] = _mm_castsi128_ph(a2b2c2d2e2f2g2h2);
+ kernel.packet[3] = _mm_castsi128_ph(a3b3c3d3e3f3g3h3);
+ kernel.packet[4] = _mm_castsi128_ph(a4b4c4d4e4f4g4h4);
+ kernel.packet[5] = _mm_castsi128_ph(a5b5c5d5e5f5g5h5);
+ kernel.packet[6] = _mm_castsi128_ph(a6b6c6d6e6f6g6h6);
+ kernel.packet[7] = _mm_castsi128_ph(a7b7c7d7e7f7g7h7);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
+ EIGEN_ALIGN32 Eigen::half in[4][8];
+ pstore<Eigen::half>(in[0], kernel.packet[0]);
+ pstore<Eigen::half>(in[1], kernel.packet[1]);
+ pstore<Eigen::half>(in[2], kernel.packet[2]);
+ pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+ EIGEN_ALIGN32 Eigen::half out[4][8];
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ out[i][j] = in[j][2 * i];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j + 4] = in[j][2 * i + 1];
+ }
+ }
+
+ kernel.packet[0] = pload<Packet8h>(out[0]);
+ kernel.packet[1] = pload<Packet8h>(out[1]);
+ kernel.packet[2] = pload<Packet8h>(out[2]);
+ kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
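
The in-register transposes above follow the usual interleave pattern: log2(N) rounds of unpacklo/unpackhi at doubling granularity (16-bit, 32-bit, 64-bit), finished by a cross-lane permute. A minimal sketch of the idea on a 4x4 block of scalars, with hypothetical unpack helpers standing in for the intrinsics:

    #include <array>

    using Vec4 = std::array<int, 4>;

    Vec4 unpacklo(const Vec4& a, const Vec4& b) { return {a[0], b[0], a[1], b[1]}; }
    Vec4 unpackhi(const Vec4& a, const Vec4& b) { return {a[2], b[2], a[3], b[3]}; }

    void transpose4(Vec4& r0, Vec4& r1, Vec4& r2, Vec4& r3) {
      Vec4 t0 = unpacklo(r0, r1), t1 = unpacklo(r2, r3);
      Vec4 t2 = unpackhi(r0, r1), t3 = unpackhi(r2, r3);
      r0 = {t0[0], t0[1], t1[0], t1[1]};  // column 0
      r1 = {t0[2], t0[3], t1[2], t1[3]};  // column 1
      r2 = {t2[0], t2[1], t3[0], t3[1]};  // column 2
      r3 = {t2[2], t2[3], t3[2], t3[3]};  // column 3
    }
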
// preverse
template <>
@@ -672,6 +1317,20 @@
a);
}
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+ return _mm256_castsi256_ph(_mm256_insertf128_si256(
+ _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)),
+ _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+ return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m));
+}
+
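
_mm_shuffle_epi8 places input byte m[i] into output byte i, so the byte pairs (14,15), (12,13), ... move 16-bit lane 7 into lane 0 and so on; the 256-bit variant additionally swaps the two 128-bit halves. The scalar semantics:

    #include <cstdint>

    void reverse_halves(const std::uint16_t in[8], std::uint16_t out[8]) {
      for (int i = 0; i < 8; ++i) out[i] = in[7 - i];
    }
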
// pscatter
template <>
@@ -684,191 +1343,68 @@
to[stride * i] = aux[i];
}
}
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
+ EIGEN_ALIGN64 half aux[16];
+ pstore(aux, from);
+ to[stride * 0] = aux[0];
+ to[stride * 1] = aux[1];
+ to[stride * 2] = aux[2];
+ to[stride * 3] = aux[3];
+ to[stride * 4] = aux[4];
+ to[stride * 5] = aux[5];
+ to[stride * 6] = aux[6];
+ to[stride * 7] = aux[7];
+ to[stride * 8] = aux[8];
+ to[stride * 9] = aux[9];
+ to[stride * 10] = aux[10];
+ to[stride * 11] = aux[11];
+ to[stride * 12] = aux[12];
+ to[stride * 13] = aux[13];
+ to[stride * 14] = aux[14];
+ to[stride * 15] = aux[15];
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
+ EIGEN_ALIGN32 Eigen::half aux[8];
+ pstore(aux, from);
+ to[stride * 0] = aux[0];
+ to[stride * 1] = aux[1];
+ to[stride * 2] = aux[2];
+ to[stride * 3] = aux[3];
+ to[stride * 4] = aux[4];
+ to[stride * 5] = aux[5];
+ to[stride * 6] = aux[6];
+ to[stride * 7] = aux[7];
+}
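
pscatter and pgather are Eigen's strided store/load primitives; the unrolled bodies above are the fixed-size form of this generic sketch (T standing in for Eigen::half):

    template <typename T, int N>
    void scatter(T* to, const T (&from)[N], long stride) {
      for (int i = 0; i < N; ++i) to[stride * i] = from[i];
    }

    template <typename T, int N>
    void gather(T (&to)[N], const T* from, long stride) {
      for (int i = 0; i < N; ++i) to[i] = from[stride * i];
    }
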
// pgather
template <>
EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
- return _mm512_castsi512_ph(_mm512_set_epi16(
- from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
- from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
- from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
- from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
- from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
- from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
- from[1 * stride].x, from[0 * stride].x));
+ return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x,
+ from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x,
+ from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
+ from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, from[16 * stride].x,
+ from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+ from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+ from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
}
template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
- __m512d result = _mm512_undefined_pd();
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
- return _mm512_castpd_ph(result);
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+ return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+ from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+ from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
}
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
- a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
- b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = psin(low);
- Packet16h highOut = psin(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pcos(low);
- Packet16h highOut = pcos(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog(low);
- Packet16h highOut = plog(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog2(low);
- Packet16h highOut = plog2(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog1p(low);
- Packet16h highOut = plog1p(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexp(low);
- Packet16h highOut = pexp(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexpm1(low);
- Packet16h highOut = pexpm1(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = ptanh(low);
- Packet16h highOut = ptanh(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1 = _mm256_undefined_si256();
- Packet16h exp2 = _mm256_undefined_si256();
-
- Packet16h lowOut = pfrexp(low, exp1);
- Packet16h highOut = pfrexp(high, exp2);
-
- exponent = combine2Packet16h(exp1, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1;
- Packet16h exp2;
- extract2Packet16h(exponent, exp1, exp2);
-
- Packet16h lowOut = pldexp(low, exp1);
- Packet16h highOut = pldexp(high, exp2);
-
- return combine2Packet16h(lowOut, highOut);
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+ return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x,
+ from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
}
} // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index 9508ac6..fc55fd8 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -237,17 +237,13 @@
return _mm512_castsi512_si128(a);
}
+#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
return _mm256_castsi256_si128(a);
}
template <>
-EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
- return _mm256_castsi256_si128(a);
-}
-
-template <>
EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
return half2float(a);
}
@@ -257,6 +253,13 @@
return float2half(a);
}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
+ return _mm256_castsi256_si128(a);
+}
+
template <>
EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
return Bf16ToF32(a);
@@ -267,68 +270,6 @@
return F32ToBf16(a);
}
-#ifdef EIGEN_VECTORIZE_AVX512FP16
-
-template <>
-EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
- return _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
- return _mm256_castsi256_si128(preinterpret<Packet16h>(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
- // Discard second-half of input.
- Packet16h low = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
- return _mm512_cvtxph_ps(_mm256_castsi256_ph(low));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
- __m512d result = _mm512_undefined_pd();
- result = _mm512_insertf64x4(
- result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
- result = _mm512_insertf64x4(
- result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
- return _mm512_castpd_ph(result);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
- // Discard second-half of input.
- Packet8h low = _mm_castps_si128(_mm256_extractf32x4_ps(_mm256_castsi256_ps(a), 0));
- return _mm256_cvtxph_ps(_mm_castsi128_ph(low));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
- __m256d result = _mm256_undefined_pd();
- result = _mm256_insertf64x2(result,
- _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
- result = _mm256_insertf64x2(result,
- _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
- return _mm256_castpd_si256(result);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
- Packet8f full = _mm256_cvtxph_ps(_mm_castsi128_ph(a));
- // Discard second-half of input.
- return _mm256_extractf32x4_ps(full, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
- __m256 result = _mm256_undefined_ps();
- result = _mm256_insertf128_ps(result, a, 0);
- result = _mm256_insertf128_ps(result, b, 1);
- return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT);
-}
-
-#endif
-
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
new file mode 100644
index 0000000..f06f13d
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_FP16_AVX512_H
+#define EIGEN_TYPE_CASTING_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE Packet32s preinterpret<Packet32s, Packet32h>(const Packet32h& a) {
+ return _mm512_castph_si512(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s preinterpret<Packet16s, Packet16h>(const Packet16h& a) {
+ return _mm256_castph_si256(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8h>(const Packet8h& a) {
+ return _mm_castph_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preinterpret<Packet32h, Packet32s>(const Packet32s& a) {
+ return _mm512_castsi512_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet16s>(const Packet16s& a) {
+ return _mm256_castsi256_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet8s>(const Packet8s& a) {
+ return _mm_castsi128_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+ return half2float(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+ return half2float(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+ return float2half(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+ return float2half(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
+ // Discard second-half of input.
+ Packet16h low = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
+ return _mm512_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
+ // Discard second-half of input.
+ Packet8h low = _mm_castps_ph(_mm256_extractf32x4_ps(_mm256_castph_ps(a), 0));
+ return _mm256_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
+ Packet8f full = _mm256_cvtxph_ps(a);
+  // Discard the second half of the result.
+ return _mm256_extractf32x4_ps(full, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
+ __m512 result = _mm512_castsi512_ps(_mm512_castsi256_si512(_mm256_castph_si256(_mm512_cvtxps_ph(a))));
+ result = _mm512_insertf32x8(result, _mm256_castph_ps(_mm512_cvtxps_ph(b)), 1);
+ return _mm512_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
+ __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castph_si128(_mm256_cvtxps_ph(a))));
+ result = _mm256_insertf32x4(result, _mm_castph_ps(_mm256_cvtxps_ph(b)), 1);
+ return _mm256_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
+ __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castps_si128(a)));
+ result = _mm256_insertf128_ps(result, b, 1);
+ return _mm256_cvtxps_ph(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pcast<Packet32h, Packet32s>(const Packet32h& a) {
+ return _mm512_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s pcast<Packet16h, Packet16s>(const Packet16h& a) {
+ return _mm256_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8h, Packet8s>(const Packet8h& a) {
+ return _mm_cvtph_epi16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet32s, Packet32h>(const Packet32s& a) {
+ return _mm512_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16s, Packet16h>(const Packet16s& a) {
+ return _mm256_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8s, Packet8h>(const Packet8s& a) {
+ return _mm_cvtepi16_ph(a);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 482064e..d7bd9be 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -186,6 +186,7 @@
HasExp = 1,
#ifdef EIGEN_VECTORIZE_VSX
HasSqrt = 1,
+ HasCbrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
#else
@@ -424,55 +425,6 @@
masked_store_available = false
};
};
-inline std::ostream& operator<<(std::ostream& s, const Packet16c& v) {
- union {
- Packet16c v;
- signed char n[16];
- } vt;
- vt.v = v;
- for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
- return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet16uc& v) {
- union {
- Packet16uc v;
- unsigned char n[16];
- } vt;
- vt.v = v;
- for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
- return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
- union {
- Packet4f v;
- float n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
- union {
- Packet4i v;
- int n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
- union {
- Packet4ui v;
- unsigned int n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}
template <typename Packet>
EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
@@ -2385,6 +2337,44 @@
}
template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ Packet4f a_even = Bf16ToF32Even(a);
+ Packet4f a_odd = Bf16ToF32Odd(a);
+ Packet4f b_even = Bf16ToF32Even(b);
+ Packet4f b_odd = Bf16ToF32Odd(b);
+ Packet4f c_even = Bf16ToF32Even(c);
+ Packet4f c_odd = Bf16ToF32Odd(c);
+ Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
+ Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
+ return F32ToBf16(pmadd_even, pmadd_odd);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ Packet4f a_even = Bf16ToF32Even(a);
+ Packet4f a_odd = Bf16ToF32Odd(a);
+ Packet4f b_even = Bf16ToF32Even(b);
+ Packet4f b_odd = Bf16ToF32Odd(b);
+ Packet4f c_even = Bf16ToF32Even(c);
+ Packet4f c_odd = Bf16ToF32Odd(c);
+ Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
+ Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
+ return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+ Packet4f a_even = Bf16ToF32Even(a);
+ Packet4f a_odd = Bf16ToF32Odd(a);
+ Packet4f b_even = Bf16ToF32Even(b);
+ Packet4f b_odd = Bf16ToF32Odd(b);
+ Packet4f c_even = Bf16ToF32Even(c);
+ Packet4f c_odd = Bf16ToF32Odd(c);
+ Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
+ Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
+ return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
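
All three new bfloat16 FMA variants widen through float using the even/odd split, which works because a bfloat16 is exactly the upper 16 bits of an IEEE float. A sketch of the per-lane widening that Bf16ToF32Even/Bf16ToF32Odd perform:

    #include <cstdint>
    #include <cstring>

    // A bf16 payload becomes a float by shifting it into the float's high half.
    inline float bf16_bits_to_float(std::uint16_t b) {
      std::uint32_t u = std::uint32_t(b) << 16;
      float f;
      std::memcpy(&f, &u, sizeof(f));
      return f;
    }
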
+template <>
EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
}
@@ -3187,6 +3177,7 @@
HasLog = 0,
HasExp = 1,
HasSqrt = 1,
+ HasCbrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
#else
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index b8d3b4f..f2e55f3 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -673,6 +673,11 @@
return bfloat16(::fmaxf(f1, f2));
}
+EIGEN_DEVICE_FUNC inline bfloat16 fma(const bfloat16& a, const bfloat16& b, const bfloat16& c) {
+ // Emulate FMA via float.
+ return bfloat16(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
+}
+
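
A brief usage sketch of the emulated fma (values arbitrary): the bfloat16 operands are promoted to float, fused there, and rounded back once:

    #include <Eigen/Core>

    Eigen::bfloat16 fma_demo() {
      Eigen::bfloat16 a(1.5f), b(2.0f), c(0.25f);
      return Eigen::numext::fma(a, b, c);  // bfloat16(3.25f)
    }
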
#ifndef EIGEN_NO_IO
EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
os << static_cast<float>(v);
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 174eb57..e9f564b 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -187,7 +187,7 @@
static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
ExponentBits = TotalBits - MantissaBits - 1;
- EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
+ constexpr ScalarUI scalar_sign_mantissa_mask =
~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
const Packet half = pset1<Packet>(Scalar(0.5));
@@ -196,7 +196,7 @@
// To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
- EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
+ constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
// The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
@@ -289,6 +289,142 @@
return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
}
+// This function implements a single step of Halley's iteration for
+// computing x = y^(1/3):
+// x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
+ const Packet& y) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
+ Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
+ Packet num = psub(x_k_cb, y);
+ Packet r = pdiv(num, denom);
+ return pnmadd(x_k, r, x_k);
+}
+
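
A scalar check of the step: for y = 8 with seed x_0 = 1.8, one iteration already lands near the root:

    double cbrt_halley_step(double x, double y) {
      double x3 = x * x * x;
      return x - (x3 - y) * x / (2.0 * x3 + y);  // Halley update for x ~= cbrt(y)
    }
    // cbrt_halley_step(1.8, 8.0) ~= 1.9985; cbrt(8) = 2.
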
+// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+// interval [0.125,1].
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+  // Extract the significand s in the range [0.5,1) and exponent e, such that
+ // x = 2^e * s.
+ Packet e, s;
+ s = pfrexp(x, e);
+
+ // Split the exponent into a part divisible by 3 and the remainder.
+ // e = 3*e_div3 + e_mod3.
+ constexpr Scalar kOneThird = Scalar(1) / 3;
+ e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
+ Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
+
+ // Replace s by y = (s * 2^e_mod3).
+ return pldexp_fast(s, e_mod3);
+}
+
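
A scalar illustration of the decomposition and its recombination (valid for x > 0):

    #include <cmath>

    double cbrt_by_decomposition(double x) {
      int e;
      double s = std::frexp(x, &e);                       // x = s * 2^e, s in [0.5, 1)
      int e_div3 = static_cast<int>(std::ceil(e / 3.0));
      int e_mod3 = e - 3 * e_div3;                        // in {-2, -1, 0}
      double y = std::ldexp(s, e_mod3);                   // y in [0.125, 1)
      return std::ldexp(std::cbrt(y), e_div3);            // == cbrt(x)
    }

For x = 20: s = 0.625 and e = 5, so e_div3 = 2, e_mod3 = -1, y = 0.3125, and cbrt(20) = cbrt(0.3125) * 2^2 ~= 2.7144.
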
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
+ const Packet& abs_root) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+
+ // Set sign.
+ const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
+ const Packet x_sign = pand(sign_mask, x);
+ Packet root = por(x_sign, abs_root);
+
+ // Pass non-finite and zero values of x straight through.
+ const Packet is_not_finite = por(pisinf(x), pisnan(x));
+ const Packet is_zero = pcmp_eq(pzero(x), x);
+ const Packet use_x = por(is_not_finite, is_zero);
+ return pselect(use_x, x, root);
+}
+
+// Generic implementation of cbrt(x) for float.
+//
+// The algorithm computes the cube root of the input by first
+// decomposing it into an exponent and significand
+// x = s * 2^e.
+//
+// We can then write the cube root as
+//
+// x^(1/3) = 2^(e/3) * s^(1/3)
+// = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
+// = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
+// = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
+//
+// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
+//
+// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
+// approximated using a cubic polynomial and subsequently refined using a
+// single step of Halley's iteration, and finally the two terms are combined
+// using pldexp_fast.
+//
+// Note: Many alternatives exist for implementing cbrt. See, for example,
+// the excellent discussion in Kahan's note:
+// https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
+// This particular implementation was found to be very fast and accurate
+// among several alternatives tried, but is probably not "optimal" on all
+// platforms.
+//
+// This is accurate to 2 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+ // interval [0.125,1].
+ Packet e_div3;
+ const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+ // Compute initial approximation accurate to 5.22e-3.
+ // The polynomial was computed using Rminimax.
+ constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
+ 3.408401906490325927734375e-01f};
+ Packet r = ppolevl<Packet, 3>::run(y, alpha);
+
+ // Take one step of Halley's iteration.
+ r = cbrt_halley_iteration_step(r, y);
+
+ // Finally multiply by 2^(e_div3)
+ r = pldexp_fast(r, e_div3);
+
+ return cbrt_special_cases_and_sign(x, r);
+}
+
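
ppolevl<Packet, 3> evaluates the polynomial by Horner's rule with alpha[0] as the leading coefficient; a scalar equivalent of the seed computation above:

    float cbrt_seed(float y) {
      const float a0 = 5.9220016002655029296875e-01f, a1 = -1.3859539031982421875e+00f;
      const float a2 = 1.4581282138824462890625e+00f, a3 = 3.408401906490325927734375e-01f;
      return ((a0 * y + a1) * y + a2) * y + a3;  // max error ~5.22e-3 on [0.125, 1]
    }
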
+// Generic implementation of cbrt(x) for double.
+//
+// The algorithm is identical to the one for float except that a different initial
+// approximation is used for y^(1/3) and two Halley iteration steps are performed.
+//
+// This is accurate to 1 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+ // interval [0.125,1].
+ Packet e_div3;
+ const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+ // Compute initial approximation accurate to 0.016.
+ // The polynomial was computed using Rminimax.
+ constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
+ 1.072314636518546304699839311069808900356292724609375e+00,
+ 3.81249427609571867048288140722434036433696746826171875e-01};
+ Packet r = ppolevl<Packet, 2>::run(y, alpha);
+
+ // Take two steps of Halley's iteration.
+ r = cbrt_halley_iteration_step(r, y);
+ r = cbrt_halley_iteration_step(r, y);
+
+ // Finally multiply by 2^(e_div3).
+ r = pldexp_fast(r, e_div3);
+ return cbrt_special_cases_and_sign(x, r);
+}
+
// Natural or base 2 logarithm.
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -1123,7 +1259,7 @@
constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
- const Packet cst_signmask = pset1<Packet>(-Scalar(0));
+ const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
const Packet cst_one = pset1<Packet>(Scalar(1));
const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
@@ -1685,7 +1821,7 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
typedef typename unpacket_traits<Packet>::type Scalar;
- EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+ constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr.
const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
Packet rho = psub(x, gamma);
@@ -1991,7 +2127,7 @@
Packet m_x = pfrexp(x, e_x);
// Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
- EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
+ constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
@@ -2074,7 +2210,7 @@
const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
// Smallest exponent for which (1 + epsilon) overflows to infinity.
- EIGEN_CONSTEXPR Scalar huge_exponent =
+ constexpr Scalar huge_exponent =
(NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index ac0e2cf..673954e 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -54,6 +54,14 @@
template <typename Packet>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent);
+/** \internal \returns cbrt(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x_in);
+
+/** \internal \returns cbrt(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x_in);
+
/** \internal \returns log(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
@@ -195,6 +203,7 @@
EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET) \
+ EIGEN_FLOAT_PACKET_FUNCTION(cbrt, PACKET) \
EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET) \
EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET) \
EIGEN_GENERIC_PACKET_FUNCTION(log1p, PACKET) \
@@ -208,6 +217,7 @@
EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET) \
+ EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, PACKET) \
EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET) \
EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 95697f3..ba70d5f 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -37,21 +37,23 @@
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
// When compiling with GPU support, the "__half_raw" base class as well as
// some other routines are defined in the GPU compiler header files
// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
// As a consequence, we get compile failures when compiling Eigen with
// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
-// Eigen with GPU support
-#pragma push_macro("EIGEN_CONSTEXPR")
-#undef EIGEN_CONSTEXPR
-#define EIGEN_CONSTEXPR
+// Eigen with GPU support.
+// Any functions that require `numext::bit_cast` may also not be constexpr,
+// including any native types when setting via raw bit values.
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#define _EIGEN_MAYBE_CONSTEXPR
+#else
+#define _EIGEN_MAYBE_CONSTEXPR constexpr
#endif
#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \
template <> \
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+ EIGEN_UNUSED EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
return float2half(METHOD<PACKET_F>(half2float(_x))); \
}
@@ -81,8 +83,10 @@
// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
// this error, and hence the following convoluted #if condition
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
+ struct construct_from_rep_tag {};
#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
// Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
// The element type for shared memory cannot have non-trivial constructors
@@ -91,43 +95,53 @@
// hence the need for this
EIGEN_DEVICE_FUNC __half_raw() {}
#else
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw() : x(0) {}
#endif
+
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+ EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, __fp16 rep) : x{rep} {}
__fp16 x;
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+ explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<_Float16>(raw)) {}
+ EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, _Float16 rep) : x{rep} {}
+ _Float16 x;
#else
- explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
+ explicit EIGEN_DEVICE_FUNC constexpr __half_raw(numext::uint16_t raw) : x(raw) {}
+ EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, numext::uint16_t rep) : x{rep} {}
numext::uint16_t x;
#endif
};
#elif defined(EIGEN_HAS_HIP_FP16)
-// Nothing to do here
+// HIP GPU compile phase: nothing to do here.
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
+
+// CUDA GPU compile phase.
#if EIGEN_CUDA_SDK_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif  // EIGEN_CUDA_SDK_VER < 90000
+
#elif defined(SYCL_DEVICE_ONLY)
typedef cl::sycl::half __half_raw;
#endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half_raw {
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
#elif defined(EIGEN_HAS_CUDA_FP16)
#if EIGEN_CUDA_SDK_VER >= 90000
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
#endif
@@ -156,21 +170,29 @@
#endif
#endif
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
#endif
- explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+ explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
+ : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+ explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(_Float16 b)
+ : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#endif
+
+ explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
template <class T>
explicit EIGEN_DEVICE_FUNC half(T val)
@@ -201,99 +223,99 @@
namespace half_impl {
template <typename = void>
struct numeric_limits_half_impl {
- static EIGEN_CONSTEXPR const bool is_specialized = true;
- static EIGEN_CONSTEXPR const bool is_signed = true;
- static EIGEN_CONSTEXPR const bool is_integer = false;
- static EIGEN_CONSTEXPR const bool is_exact = false;
- static EIGEN_CONSTEXPR const bool has_infinity = true;
- static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
- static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
+ static constexpr const bool is_specialized = true;
+ static constexpr const bool is_signed = true;
+ static constexpr const bool is_integer = false;
+ static constexpr const bool is_exact = false;
+ static constexpr const bool has_infinity = true;
+ static constexpr const bool has_quiet_NaN = true;
+ static constexpr const bool has_signaling_NaN = true;
EIGEN_DIAGNOSTICS(push)
EIGEN_DISABLE_DEPRECATED_WARNING
- static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present;
- static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
+ static constexpr const std::float_denorm_style has_denorm = std::denorm_present;
+ static constexpr const bool has_denorm_loss = false;
EIGEN_DIAGNOSTICS(pop)
- static EIGEN_CONSTEXPR const std::float_round_style round_style = std::round_to_nearest;
- static EIGEN_CONSTEXPR const bool is_iec559 = true;
+ static constexpr const std::float_round_style round_style = std::round_to_nearest;
+ static constexpr const bool is_iec559 = true;
// The C++ standard defines this as "true if the set of values representable
// by the type is finite." Half has finite precision.
- static EIGEN_CONSTEXPR const bool is_bounded = true;
- static EIGEN_CONSTEXPR const bool is_modulo = false;
- static EIGEN_CONSTEXPR const int digits = 11;
- static EIGEN_CONSTEXPR const int digits10 =
+ static constexpr const bool is_bounded = true;
+ static constexpr const bool is_modulo = false;
+ static constexpr const int digits = 11;
+ static constexpr const int digits10 =
3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static EIGEN_CONSTEXPR const int max_digits10 =
+ static constexpr const int max_digits10 =
5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
- static EIGEN_CONSTEXPR const int min_exponent = -13;
- static EIGEN_CONSTEXPR const int min_exponent10 = -4;
- static EIGEN_CONSTEXPR const int max_exponent = 16;
- static EIGEN_CONSTEXPR const int max_exponent10 = 4;
- static EIGEN_CONSTEXPR const bool traps = std::numeric_limits<float>::traps;
+ static constexpr const int radix = std::numeric_limits<float>::radix;
+ static constexpr const int min_exponent = -13;
+ static constexpr const int min_exponent10 = -4;
+ static constexpr const int max_exponent = 16;
+ static constexpr const int max_exponent10 = 4;
+ static constexpr const bool traps = std::numeric_limits<float>::traps;
// IEEE754: "The implementer shall choose how tininess is detected, but shall
// detect tininess in the same way for all operations in radix two"
- static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
+ static constexpr const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
- static EIGEN_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
- static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
- static EIGEN_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
- static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
- static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
- static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
- static EIGEN_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
- static EIGEN_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
- static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
};
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_specialized;
+constexpr const bool numeric_limits_half_impl<T>::is_specialized;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_signed;
+constexpr const bool numeric_limits_half_impl<T>::is_signed;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_integer;
+constexpr const bool numeric_limits_half_impl<T>::is_integer;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_exact;
+constexpr const bool numeric_limits_half_impl<T>::is_exact;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_infinity;
+constexpr const bool numeric_limits_half_impl<T>::has_infinity;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_quiet_NaN;
+constexpr const bool numeric_limits_half_impl<T>::has_quiet_NaN;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_signaling_NaN;
+constexpr const bool numeric_limits_half_impl<T>::has_signaling_NaN;
EIGEN_DIAGNOSTICS(push)
EIGEN_DISABLE_DEPRECATED_WARNING
template <typename T>
-EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
+constexpr const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_denorm_loss;
+constexpr const bool numeric_limits_half_impl<T>::has_denorm_loss;
EIGEN_DIAGNOSTICS(pop)
template <typename T>
-EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl<T>::round_style;
+constexpr const std::float_round_style numeric_limits_half_impl<T>::round_style;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_iec559;
+constexpr const bool numeric_limits_half_impl<T>::is_iec559;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_bounded;
+constexpr const bool numeric_limits_half_impl<T>::is_bounded;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_modulo;
+constexpr const bool numeric_limits_half_impl<T>::is_modulo;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits;
+constexpr const int numeric_limits_half_impl<T>::digits;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits10;
+constexpr const int numeric_limits_half_impl<T>::digits10;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_digits10;
+constexpr const int numeric_limits_half_impl<T>::max_digits10;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::radix;
+constexpr const int numeric_limits_half_impl<T>::radix;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent;
+constexpr const int numeric_limits_half_impl<T>::min_exponent;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent10;
+constexpr const int numeric_limits_half_impl<T>::min_exponent10;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent;
+constexpr const int numeric_limits_half_impl<T>::max_exponent;
template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent10;
+constexpr const int numeric_limits_half_impl<T>::max_exponent10;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::traps;
+constexpr const bool numeric_limits_half_impl<T>::traps;
template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::tinyness_before;
+constexpr const bool numeric_limits_half_impl<T>::tinyness_before;
} // end namespace half_impl
} // end namespace Eigen
@@ -320,8 +342,7 @@
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU half types are rather different from native CPU half types.
-// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
-#define EIGEN_HAS_NATIVE_FP16
+#define EIGEN_HAS_NATIVE_GPU_FP16
#endif
// Intrinsics for native fp16 support. Note that on current hardware,
@@ -329,7 +350,7 @@
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
-#if defined(EIGEN_HAS_NATIVE_FP16)
+#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hadd(::__half(a), ::__half(b));
@@ -371,7 +392,8 @@
EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
-#endif
+
+#endif // EIGEN_HAS_NATIVE_GPU_FP16
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
@@ -401,16 +423,47 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(a.x + b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(a.x * b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(a.x - b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(a.x / b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(-a.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
+ a = a + b;
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
+ a = a * b;
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
+ a = a - b;
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
+ a = a / b;
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return a.x == b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return a.x != b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return a.x < b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return a.x <= b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return a.x > b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return a.x >= b.x; }
+
// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
-#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
+#elif !defined(EIGEN_HAS_NATIVE_GPU_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#define EIGEN_DEVICE_FUNC __host__
#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
@@ -458,6 +511,7 @@
#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
#pragma pop_macro("EIGEN_DEVICE_FUNC")
#endif
+
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
@@ -493,7 +547,7 @@
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
// We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
// in the hip_fp16 header file, and that will trigger a compile error
// On the other hand, having anything but a return statement also triggers a compile error
@@ -515,6 +569,8 @@
// For SYCL, cl::sycl::half is _Float16, so cast directly.
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+ return numext::bit_cast<numext::uint16_t>(h.x);
#elif defined(SYCL_DEVICE_ONLY)
return numext::bit_cast<numext::uint16_t>(h);
#else
@@ -528,6 +584,16 @@
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+ __half_raw h;
+ h.x = static_cast<__fp16>(ff);
+ return h;
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+ __half_raw h;
+ h.x = static_cast<_Float16>(ff);
+ return h;
+
#elif defined(EIGEN_HAS_FP16_C)
__half_raw h;
#if EIGEN_COMP_MSVC
@@ -538,11 +604,6 @@
#endif
return h;
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- __half_raw h;
- h.x = static_cast<__fp16>(ff);
- return h;
-
#else
uint32_t f_bits = Eigen::numext::bit_cast<uint32_t>(ff);
const uint32_t f32infty_bits = {255 << 23};
@@ -595,6 +656,8 @@
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+ return static_cast<float>(h.x);
#elif defined(EIGEN_HAS_FP16_C)
#if EIGEN_COMP_MSVC
// MSVC does not have scalar instructions.
@@ -602,8 +665,6 @@
#else
return _cvtsh_ss(h.x);
#endif
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- return static_cast<float>(h.x);
#else
const float magic = Eigen::numext::bit_cast<float>(static_cast<uint32_t>(113 << 23));
const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
@@ -628,7 +689,7 @@
// --- standard functions ---
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
-#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
#else
return (a.x & 0x7fff) == 0x7c00;
@@ -638,7 +699,7 @@
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a);
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
#else
return (a.x & 0x7fff) > 0x7c00;
@@ -651,6 +712,11 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
return half(vabsh_f16(a.x));
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+ half result;
+ result.x =
+ numext::bit_cast<_Float16>(static_cast<numext::uint16_t>(numext::bit_cast<numext::uint16_t>(a.x) & 0x7FFF));
+ return result;
#else
half result;
result.x = a.x & 0x7FFF;
@@ -734,24 +800,19 @@
return half(::fmodf(float(a), float(b)));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
- return __hlt(b, a) ? b : a;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { return b < a ? b : a; }
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { return a < b ? b : a; }
+
+EIGEN_DEVICE_FUNC inline half fma(const half& a, const half& b, const half& c) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+ return half(vfmah_f16(c.x, a.x, b.x));
+#elif defined(EIGEN_VECTORIZE_AVX512FP16)
+ // Reduces to vfmadd213sh.
+ return half(_mm_cvtsh_h(_mm_fmadd_ph(_mm_set_sh(a.x), _mm_set_sh(b.x), _mm_set_sh(c.x))));
#else
- const float f1 = static_cast<float>(a);
- const float f2 = static_cast<float>(b);
- return f2 < f1 ? b : a;
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
- return __hlt(a, b) ? b : a;
-#else
- const float f1 = static_cast<float>(a);
- const float f2 = static_cast<float>(b);
- return f1 < f2 ? b : a;
+ // Emulate FMA via float.
+ return half(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
#endif
}
@@ -794,31 +855,29 @@
struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
return half_impl::raw_uint16_to_half(0x7bff);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
return half_impl::raw_uint16_to_half(0xfbff);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
return half_impl::raw_uint16_to_half(0x7c00);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
return half_impl::raw_uint16_to_half(0x7e00);
}
};
} // end namespace Eigen
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-#pragma pop_macro("EIGEN_CONSTEXPR")
-#endif
+#undef _EIGEN_MAYBE_CONSTEXPR
namespace Eigen {
namespace numext {
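
With EIGEN_HAS_BUILTIN_FLOAT16, __half_raw stores a native _Float16 and the scalar operators above act on it directly; the float round trip remains only as a fallback. A short sketch of the caller-side behavior, assuming numext::fma is the public entry point named in the changelog:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::half a(1.5f), b(2.0f), c(0.25f);
  // Plain arithmetic maps to __fp16/_Float16 instructions when available,
  // otherwise to the emulated float path.
  Eigen::half d = a * b + c;
  // Fused multiply-add; falls back to numext::fma<float> where no native
  // half FMA instruction exists.
  Eigen::half e = Eigen::numext::fma(a, b, c);
  std::cout << static_cast<float>(d) << " " << static_cast<float>(e) << "\n";
}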
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h
index 6bea9ac..402d92f 100644
--- a/Eigen/src/Core/arch/GPU/Tuple.h
+++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -34,7 +34,7 @@
template <typename U1 = T1,
typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
reduce_all<std::is_default_constructible<Ts>::value...>::value>>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+ constexpr EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
// Element constructor.
template <typename U1, typename... Us,
@@ -44,7 +44,7 @@
sizeof...(Us) == sizeof...(Ts) && (
// this does not look like a copy/move constructor.
N > 1 || std::is_convertible<U1, T1>::value)>>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
+ constexpr EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
: head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
// The first stored value.
@@ -102,11 +102,11 @@
using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType;
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
}
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
}
};
@@ -117,11 +117,9 @@
using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
using ReturnType = T1;
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) {
- return tuple.head();
- }
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) { return tuple.head(); }
};
// Concatenates N Tuples.
@@ -139,11 +137,9 @@
// Uses the index sequences to extract and merge elements from tuple1 and tuple2,
// then recursively calls again.
template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1,
- std::index_sequence<I1s...>,
- Tuple2&& tuple2,
- std::index_sequence<I2s...>,
- MoreTuples&&... tuples) {
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>,
+ Tuple2&& tuple2, std::index_sequence<I2s...>,
+ MoreTuples&&... tuples) {
return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run(
MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))...,
tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...),
@@ -152,8 +148,8 @@
// Concatenates the first two tuples.
template <typename Tuple1, typename Tuple2, typename... MoreTuples>
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
- MoreTuples&&... tuples) {
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
+ MoreTuples&&... tuples) {
return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2),
std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...);
}
@@ -165,7 +161,7 @@
using ReturnType = TupleImpl<N, Args...>;
template <typename Tuple1>
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
return tuple1;
}
};
@@ -174,7 +170,7 @@
template <>
struct tuple_cat_impl<0> {
using ReturnType = TupleImpl<0>;
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
+ static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
};
// For use in make_tuple, unwraps a reference_wrapper.
@@ -211,13 +207,13 @@
* \return a reference to the desired element.
*/
template <size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
const TupleImpl<sizeof...(Types), Types...>& tuple) {
return tuple_get_impl<Idx, Types...>::run(tuple);
}
template <size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
TupleImpl<sizeof...(Types), Types...>& tuple) {
return tuple_get_impl<Idx, Types...>::run(tuple);
}
@@ -229,7 +225,7 @@
*/
template <typename... Tuples, typename EnableIf = std::enable_if_t<
internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
tuple_cat(Tuples&&... tuples) {
return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
@@ -239,7 +235,7 @@
* Tie arguments together into a tuple.
*/
template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) noexcept {
return ReturnType{args...};
}
@@ -247,7 +243,7 @@
* Create a tuple of l-values with the supplied arguments.
*/
template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
return ReturnType{std::forward<Args>(args)...};
}
@@ -255,8 +251,7 @@
* Forward a set of arguments as a tuple.
*/
template <typename... Args>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(
- Args&&... args) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) {
return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...);
}
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 3f2d9d5..6d7f038 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -207,6 +207,7 @@
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
@@ -4965,6 +4966,26 @@
}
template <>
+EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+ return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+ return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+ return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+ return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
@@ -5140,6 +5161,7 @@
HasCos = EIGEN_FAST_MATH,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH
@@ -5635,6 +5657,21 @@
}
template <>
+EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+ return vfmaq_f16(pnegate(c), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+ return vfma_f16(c, pnegate(a), b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+ return vfma_f16(pnegate(c), pnegate(a), b);
+}
+
+template <>
EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
return vminq_f16(a, b);
}
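
These specializations follow Eigen's packet FMA conventions, which the vfma/pnegate combinations above encode directly. A scalar reference sketch of the four variants:

// Scalar semantics of the packet FMA family (reference sketch).
template <typename T>
T pmadd_ref(T a, T b, T c) { return a * b + c; }
template <typename T>
T pmsub_ref(T a, T b, T c) { return a * b - c; }
template <typename T>
T pnmadd_ref(T a, T b, T c) { return c - a * b; }
template <typename T>
T pnmsub_ref(T a, T b, T c) { return -(a * b) - c; }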
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index c69e3d4..f79da7b 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -465,19 +465,11 @@
}
template <>
EIGEN_STRONG_INLINE Packet2cf pnmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
- __m128 a_odd = _mm_movehdup_ps(a.v);
- __m128 a_even = _mm_moveldup_ps(a.v);
- __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
- __m128 result = _mm_fmaddsub_ps(a_odd, b_swap, _mm_fmaddsub_ps(a_even, b.v, c.v));
- return Packet2cf(result);
+ return pnegate(pmsub(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pnmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
- __m128 a_odd = _mm_movehdup_ps(a.v);
- __m128 a_even = _mm_moveldup_ps(a.v);
- __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
- __m128 result = _mm_fmaddsub_ps(a_odd, b_swap, _mm_fmsubadd_ps(a_even, b.v, c.v));
- return Packet2cf(result);
+ return pnegate(pmadd(a, b, c));
}
// std::complex<double>
template <>
@@ -498,19 +490,11 @@
}
template <>
EIGEN_STRONG_INLINE Packet1cd pnmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
- __m128d a_odd = _mm_permute_pd(a.v, 0x3);
- __m128d a_even = _mm_movedup_pd(a.v);
- __m128d b_swap = _mm_permute_pd(b.v, 0x1);
- __m128d result = _mm_fmaddsub_pd(a_odd, b_swap, _mm_fmaddsub_pd(a_even, b.v, c.v));
- return Packet1cd(result);
+ return pnegate(pmsub(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pnmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
- __m128d a_odd = _mm_permute_pd(a.v, 0x3);
- __m128d a_even = _mm_movedup_pd(a.v);
- __m128d b_swap = _mm_permute_pd(b.v, 0x1);
- __m128d result = _mm_fmaddsub_pd(a_odd, b_swap, _mm_fmsubadd_pd(a_even, b.v, c.v));
- return Packet1cd(result);
+ return pnegate(pmadd(a, b, c));
}
#endif
} // end namespace internal
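
The simplified bodies rest on two identities rather than re-deriving the interleaved shuffle sequences:

  pnmadd(a, b, c) = -(a*b) + c = -(a*b - c) = -pmsub(a, b, c)
  pnmsub(a, b, c) = -(a*b) - c = -(a*b + c) = -pmadd(a, b, c)

Negating the already-corrected pmadd/pmsub kernels keeps the complex FMA sign fix in a single place.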
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 5e91fba..70d13d6 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -195,6 +195,7 @@
HasBessel = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
@@ -222,6 +223,7 @@
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasCbrt = 1,
HasATan = 1,
HasATanh = 1,
HasBlend = 1
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 7edcc60..0239262 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -29,6 +29,11 @@
EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
pstoret<DstScalar, Packet, Alignment>(a, b);
}
+
+ template <int Alignment, typename Packet>
+ EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+ pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count);
+ }
};
// Empty overload for void type (used by PermutationMatrix)
@@ -60,6 +65,12 @@
assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
}
+
+ template <int Alignment, typename Packet>
+ EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+ assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>(
+ a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count);
+ }
};
template <typename DstScalar, typename SrcScalar, typename Func>
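
assignPacketSegment mirrors assignPacket but carries a begin/count pair, so partial reductions and loop tails can use masked loads and stores instead of falling back to scalar code. A scalar sketch of the intended store contract; the begin/count semantics here are an assumption read off the parameter names, not the actual vectorized implementation:

// Reference semantics for a segment store (sketch).
template <typename Scalar, int PacketSize>
void store_segment_ref(Scalar* dst, const Scalar (&packet)[PacketSize], int begin, int count) {
  // Only lanes [begin, begin + count) are written; all other memory
  // must be left untouched.
  for (int i = begin; i < begin + count; ++i) dst[i] = packet[i];
}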
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index c91e6bb..a93b998 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -438,7 +438,6 @@
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
- maybe_raise_div_by_zero<Packet>::run(b);
return internal::pdiv(a, b);
}
};
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 14b56d7..35dc738 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -28,7 +28,7 @@
const Scalar m_other;
};
template <typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> > {
+struct functor_traits<scalar_constant_op<Scalar>> {
enum {
Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
PacketAccess = packet_traits<Scalar>::Vectorizable,
@@ -56,7 +56,7 @@
}
};
template <typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> > {
+struct functor_traits<scalar_identity_op<Scalar>> {
enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true };
};
@@ -86,18 +86,19 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
// Principle:
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size-1] ) )
+ Packet low = pset1<Packet>(m_low);
+ Packet high = pset1<Packet>(m_high);
+ Packet step = pset1<Packet>(m_step);
if (m_flip) {
Packet pi = plset<Packet>(Scalar(i - m_size1));
- Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
- if (EIGEN_PREDICT_TRUE(i != 0)) return res;
- Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
- return pselect<Packet>(mask, res, pset1<Packet>(m_low));
+ Packet res = pmadd(step, pi, high);
+ Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i)));
+ return pselect<Packet>(mask, res, low);
} else {
Packet pi = plset<Packet>(Scalar(i));
- Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
- if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits<Packet>::size + 1)) return res;
- Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size - 1));
- return pselect<Packet>(mask, res, pset1<Packet>(m_high));
+ Packet res = pmadd(step, pi, low);
+ Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1)));
+ return pselect<Packet>(mask, res, high);
}
}
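
The rewrite replaces the branchy endpoint handling with an unconditional blend: every lane computes low + step*i (or high + step*i in the flipped case), and a mask swaps in the exact boundary value, high at the top end or low at index 0 when flipped, avoiding rounding error at the endpoints. The scalar equivalent of the non-flipped branch, as a sketch:

// Scalar equivalent of the vectorized linspaced fill (non-flipped case).
// The last element is forced to 'high' exactly.
void linspaced_ref(float* v, int n, float low, float high, float step) {
  for (int j = 0; j < n; ++j) v[j] = (j < n - 1) ? low + step * static_cast<float>(j) : high;
}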
@@ -139,7 +140,7 @@
template <typename Scalar>
struct linspaced_op;
template <typename Scalar>
-struct functor_traits<linspaced_op<Scalar> > {
+struct functor_traits<linspaced_op<Scalar>> {
enum {
Cost = 1,
PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
@@ -192,7 +193,7 @@
};
template <typename Scalar>
-struct functor_traits<equalspaced_op<Scalar> > {
+struct functor_traits<equalspaced_op<Scalar>> {
enum {
Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
PacketAccess =
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 03542e3..ba7d97a 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -558,11 +558,15 @@
template <typename Scalar>
struct scalar_cbrt_op {
EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+ return internal::pcbrt(a);
+ }
};
template <typename Scalar>
struct functor_traits<scalar_cbrt_op<Scalar>> {
- enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+ enum { Cost = 20 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCbrt };
};
/** \internal
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 8f7b7dd..4f36689 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -217,7 +217,7 @@
// Note that the actual number of threads might be lower than the number of
// requested ones
Index actual_threads = omp_get_num_threads();
- GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
+ GemmParallelInfo<Index> info(static_cast<int>(i), static_cast<int>(actual_threads), task_info);
Index blockCols = (cols / actual_threads) & ~Index(0x3);
Index blockRows = (rows / actual_threads);
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 5d3f1cf..49f307c 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -285,6 +285,8 @@
#ifdef __AVX512FP16__
#ifdef __AVX512VL__
#define EIGEN_VECTORIZE_AVX512FP16
+// Built-in _Float16.
+#define EIGEN_HAS_BUILTIN_FLOAT16 1
#else
#if EIGEN_COMP_GNUC
#error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
diff --git a/Eigen/src/Core/util/EmulateArray.h b/Eigen/src/Core/util/EmulateArray.h
index f2fd10b..6c4c22d 100644
--- a/Eigen/src/Core/util/EmulateArray.h
+++ b/Eigen/src/Core/util/EmulateArray.h
@@ -248,15 +248,15 @@
#endif
template <std::size_t I_, class T, std::size_t N>
-constexpr inline T& array_get(std::array<T, N>& a) {
+constexpr T& array_get(std::array<T, N>& a) {
return (T&)STD_GET_ARR_HACK;
}
template <std::size_t I_, class T, std::size_t N>
-constexpr inline T&& array_get(std::array<T, N>&& a) {
+constexpr T&& array_get(std::array<T, N>&& a) {
return (T&&)STD_GET_ARR_HACK;
}
template <std::size_t I_, class T, std::size_t N>
-constexpr inline T const& array_get(std::array<T, N> const& a) {
+constexpr T const& array_get(std::array<T, N> const& a) {
return (T const&)STD_GET_ARR_HACK;
}
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 2488be4..3c0bc46 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -514,6 +514,9 @@
struct eigen_memset_helper;
template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
struct eigen_zero_impl;
+
+template <typename Packet>
+struct has_packet_segment : std::false_type {};
} // namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 7347dbb..aed8a88 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -1281,11 +1281,6 @@
#define EIGEN_CATCH(X) else
#endif
-#define EIGEN_NOEXCEPT noexcept
-#define EIGEN_NOEXCEPT_IF(x) noexcept(x)
-#define EIGEN_NO_THROW noexcept(true)
-#define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
-
// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
namespace Eigen {
namespace internal {
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 0facd26..89b2fff 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -833,46 +833,44 @@
// HIP does not support new/delete on device.
#if EIGEN_MAX_ALIGN_BYTES != 0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
- EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
- EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
- EIGEN_CATCH(...) { return 0; } \
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+ EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) noexcept { \
+ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
+ EIGEN_CATCH(...) { return 0; } \
}
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
- EIGEN_DEVICE_FUNC void* operator new(std::size_t size) { \
- return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
- } \
- EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) { \
- return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
- } \
- EIGEN_DEVICE_FUNC void operator delete(void* ptr) EIGEN_NO_THROW { \
- Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
- } \
- EIGEN_DEVICE_FUNC void operator delete[](void* ptr) EIGEN_NO_THROW { \
- Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
- } \
- EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) EIGEN_NO_THROW { \
- Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
- } \
- EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) EIGEN_NO_THROW { \
- Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
- } \
- /* in-place new and delete. since (at least afaik) there is no actual */ \
- /* memory allocated we can safely let the default implementation handle */ \
- /* this particular case. */ \
- EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); } \
- EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); } \
- EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) EIGEN_NO_THROW { \
- return ::operator delete(memory, ptr); \
- } \
- EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) EIGEN_NO_THROW { \
- return ::operator delete[](memory, ptr); \
- } \
- /* nothrow-new (returns zero instead of std::bad_alloc) */ \
- EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
- EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
- Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
- } \
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
+ EIGEN_DEVICE_FUNC void* operator new(std::size_t size) { \
+ return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+ } \
+ EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) { \
+ return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+ } \
+ EIGEN_DEVICE_FUNC void operator delete(void* ptr) noexcept { \
+ Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+ } \
+ EIGEN_DEVICE_FUNC void operator delete[](void* ptr) noexcept { \
+ Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+ } \
+ EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) noexcept { \
+ Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+ } \
+ EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) noexcept { \
+ Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+ } \
+ /* in-place new and delete. since (at least afaik) there is no actual */ \
+ /* memory allocated we can safely let the default implementation handle */ \
+ /* this particular case. */ \
+ EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); } \
+ EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); } \
+ EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) noexcept { return ::operator delete(memory, ptr); } \
+ EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) noexcept { \
+ return ::operator delete[](memory, ptr); \
+ } \
+ /* nothrow-new (returns zero instead of std::bad_alloc) */ \
+ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+ EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) noexcept { \
+ Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+ } \
typedef void eigen_aligned_operator_new_marker_type;
#else
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
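
Behavior is unchanged here (EIGEN_NO_THROW simply becomes noexcept), but this macro family is what EIGEN_MAKE_ALIGNED_OPERATOR_NEW expands through when a class holds fixed-size vectorizable members, e.g.:

#include <Eigen/Core>

struct Body {
  Eigen::Vector4f velocity;  // fixed-size member that wants 16-byte alignment
  double mass;
  // Overloads operator new/delete so heap allocations of Body are
  // suitably aligned; expands via EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF.
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
};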
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 40604f8..ddbc898 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -90,12 +90,8 @@
* we however don't want to add a dependency to Boost.
*/
-struct true_type {
- enum { value = 1 };
-};
-struct false_type {
- enum { value = 0 };
-};
+using std::false_type;
+using std::true_type;
template <bool Condition>
struct bool_constant;
@@ -341,7 +337,7 @@
#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(T&& x) {
+constexpr auto index_list_size(T&& x) {
using std::ssize;
return ssize(std::forward<T>(x));
}
@@ -349,13 +345,13 @@
#else
template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(const T& x) {
+constexpr auto index_list_size(const T& x) {
using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
return static_cast<R>(x.size());
}
template <typename T, std::ptrdiff_t N>
-EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) {
+constexpr std::ptrdiff_t index_list_size(const T (&)[N]) {
return N;
}
#endif
@@ -641,21 +637,21 @@
constexpr bool is_int_or_enum_v = std::is_enum<A>::value || std::is_integral<A>::value;
template <typename A, typename B>
-inline constexpr void plain_enum_asserts(A, B) {
+constexpr void plain_enum_asserts(A, B) {
static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
}
/// \internal Gets the minimum of two values which may be integers or enums
template <typename A, typename B>
-inline constexpr int plain_enum_min(A a, B b) {
+constexpr int plain_enum_min(A a, B b) {
plain_enum_asserts(a, b);
return ((int)a <= (int)b) ? (int)a : (int)b;
}
/// \internal Gets the maximum of two values which may be integers or enums
template <typename A, typename B>
-inline constexpr int plain_enum_max(A a, B b) {
+constexpr int plain_enum_max(A a, B b) {
plain_enum_asserts(a, b);
return ((int)a >= (int)b) ? (int)a : (int)b;
}
@@ -667,7 +663,7 @@
* finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
*/
template <typename A, typename B>
-inline constexpr int min_size_prefer_dynamic(A a, B b) {
+constexpr int min_size_prefer_dynamic(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == 0 || (int)b == 0) return 0;
if ((int)a == 1 || (int)b == 1) return 1;
@@ -682,7 +678,7 @@
* 0 and 3), it is not more than 3.
*/
template <typename A, typename B>
-inline constexpr int min_size_prefer_fixed(A a, B b) {
+constexpr int min_size_prefer_fixed(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == 0 || (int)b == 0) return 0;
if ((int)a == 1 || (int)b == 1) return 1;
@@ -694,7 +690,7 @@
/// \internal see `min_size_prefer_fixed`. No need for a separate variant for MaxSizes here.
template <typename A, typename B>
-inline constexpr int max_size_prefer_dynamic(A a, B b) {
+constexpr int max_size_prefer_dynamic(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic;
return plain_enum_max(a, b);
@@ -714,38 +710,38 @@
}
template <typename A, typename B>
-inline constexpr bool enum_lt_not_dynamic(A a, B b) {
+constexpr bool enum_lt_not_dynamic(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == Dynamic || (int)b == Dynamic) return false;
return (int)a < (int)b;
}
template <typename A, typename B>
-inline constexpr bool enum_le_not_dynamic(A a, B b) {
+constexpr bool enum_le_not_dynamic(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == Dynamic || (int)b == Dynamic) return false;
return (int)a <= (int)b;
}
template <typename A, typename B>
-inline constexpr bool enum_gt_not_dynamic(A a, B b) {
+constexpr bool enum_gt_not_dynamic(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == Dynamic || (int)b == Dynamic) return false;
return (int)a > (int)b;
}
template <typename A, typename B>
-inline constexpr bool enum_ge_not_dynamic(A a, B b) {
+constexpr bool enum_ge_not_dynamic(A a, B b) {
plain_enum_asserts(a, b);
if ((int)a == Dynamic || (int)b == Dynamic) return false;
return (int)a >= (int)b;
}
/// \internal Calculate logical XOR at compile time
-inline constexpr bool logical_xor(bool a, bool b) { return a != b; }
+constexpr bool logical_xor(bool a, bool b) { return a != b; }
/// \internal Calculate logical IMPLIES at compile time
-inline constexpr bool check_implication(bool a, bool b) { return !a || b; }
+constexpr bool check_implication(bool a, bool b) { return !a || b; }
/// \internal Provide fallback for std::is_constant_evaluated for pre-C++20.
#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
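
Dropping the redundant inline on these constexpr helpers changes nothing semantically; the Dynamic-aware behavior is easiest to see with concrete values (Dynamic is negative, so it cannot simply be fed to min/max):

#include <Eigen/Core>

// min(3, Dynamic) could be anything in [0, 3], so the result stays Dynamic;
// the "prefer fixed" variant bounds it by 3 instead.
static_assert(Eigen::internal::min_size_prefer_dynamic(3, Eigen::Dynamic) == Eigen::Dynamic, "");
static_assert(Eigen::internal::min_size_prefer_fixed(3, Eigen::Dynamic) == 3, "");
static_assert(Eigen::internal::max_size_prefer_dynamic(3, Eigen::Dynamic) == Eigen::Dynamic, "");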
diff --git a/Eigen/src/Core/util/ReshapedHelper.h b/Eigen/src/Core/util/ReshapedHelper.h
index e569408..1747950 100644
--- a/Eigen/src/Core/util/ReshapedHelper.h
+++ b/Eigen/src/Core/util/ReshapedHelper.h
@@ -40,7 +40,7 @@
inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { return total / other; }
-constexpr inline int get_compiletime_reshape_order(int flags, int order) {
+constexpr int get_compiletime_reshape_order(int flags, int order) {
return order == AutoOrder ? flags & RowMajorBit : order;
}
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index a42bb0f..a0e160e 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -158,8 +158,8 @@
EIGEN_ONLY_USED_FOR_DEBUG(v);
eigen_assert(v == T(Value));
}
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR operator T() const { return T(Value); }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr operator T() const { return T(Value); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T v) const {
EIGEN_ONLY_USED_FOR_DEBUG(v);
eigen_assert(v == T(Value));
@@ -171,7 +171,7 @@
T m_value;
public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
@@ -186,7 +186,7 @@
EIGEN_ONLY_USED_FOR_DEBUG(v);
eigen_assert(v == T(Value));
}
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
};
@@ -315,7 +315,7 @@
};
#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
-constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
if ((ArrayBytes % AlignmentBytes) == 0) {
return AlignmentBytes;
} else if (EIGEN_MIN_ALIGN_BYTES < AlignmentBytes) {
@@ -327,7 +327,7 @@
#else
// If static alignment is disabled, no need to bother.
// This also avoids a division by zero
-constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
EIGEN_UNUSED_VARIABLE(ArrayBytes);
EIGEN_UNUSED_VARIABLE(AlignmentBytes);
return 0;
@@ -362,7 +362,7 @@
typedef Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_> type;
};
-constexpr inline unsigned compute_matrix_flags(int Options) {
+constexpr unsigned compute_matrix_flags(int Options) {
unsigned row_major_bit = Options & RowMajor ? RowMajorBit : 0;
// FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
// and then propagate this information to the evaluator's flags.
@@ -370,7 +370,7 @@
return DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit;
}
-constexpr inline int size_at_compile_time(int rows, int cols) {
+constexpr int size_at_compile_time(int rows, int cols) {
if (rows == 0 || cols == 0) return 0;
if (rows == Dynamic || cols == Dynamic) return Dynamic;
return rows * cols;
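Note: dropping `inline` on these helpers is purely cosmetic, since constexpr functions are implicitly inline; behavior is unchanged. For reference, size_at_compile_time propagates Dynamic unless an extent is known to be zero, as this sketch shows (the static_asserts are illustrative, not from the patch):

    #include <Eigen/Core>

    static_assert(Eigen::internal::size_at_compile_time(3, 4) == 12, "fixed extents multiply");
    static_assert(Eigen::internal::size_at_compile_time(0, Eigen::Dynamic) == 0, "a zero extent wins");
    static_assert(Eigen::internal::size_at_compile_time(3, Eigen::Dynamic) == Eigen::Dynamic, "otherwise Dynamic");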
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index 5915387..a54d82d 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -449,23 +449,23 @@
Index lr = (std::min)(k + 4, dim); // last row to update
Map<Matrix<Scalar, Dynamic, 1> > tmp(m_workspace.data(), lr);
// S
- tmp = m_S.template middleCols<2>(k).topRows(lr) * essential2;
+ tmp.noalias() = m_S.template middleCols<2>(k).topRows(lr) * essential2;
tmp += m_S.col(k + 2).head(lr);
m_S.col(k + 2).head(lr) -= tau * tmp;
- m_S.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint();
+ m_S.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
// T
tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2;
tmp += m_T.col(k + 2).head(lr);
m_T.col(k + 2).head(lr) -= tau * tmp;
- m_T.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint();
+ m_T.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
}
if (m_computeQZ) {
// Z
Map<Matrix<Scalar, 1, Dynamic> > tmp(m_workspace.data(), dim);
- tmp = essential2.adjoint() * (m_Z.template middleRows<2>(k));
+ tmp.noalias() = essential2.adjoint() * (m_Z.template middleRows<2>(k));
tmp += m_Z.row(k + 2);
m_Z.row(k + 2) -= tau * tmp;
- m_Z.template middleRows<2>(k) -= essential2 * (tau * tmp);
+ m_Z.template middleRows<2>(k).noalias() -= essential2 * (tau * tmp);
}
m_T.coeffRef(k + 2, k) = m_T.coeffRef(k + 2, k + 1) = Scalar(0.0);
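Note: the .noalias() additions here (and in the Transform, LU, SVD, and bidiagonalization hunks below) all follow the same pattern from the "Use more .noalias()" changelog entry: when the destination of a matrix product is known not to overlap the operands, .noalias() lets Eigen skip the defensive temporary it would otherwise create. A minimal standalone sketch with hypothetical matrices:

    #include <Eigen/Dense>

    void noalias_demo() {
      Eigen::MatrixXd a = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd b = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd c(4, 4);
      c.noalias() = a * b;    // safe: c overlaps neither a nor b, so no temporary is needed
      // a.noalias() = a * b; // would be wrong: a appears on both sides
    }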
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index b2f07bc..4da6d07 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -515,8 +515,8 @@
result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
}
- EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
- EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ constexpr Index rows() const noexcept { return m_matrix.rows(); }
+ constexpr Index cols() const noexcept { return m_matrix.cols(); }
protected:
typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 64c1b65..795af0d 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -69,10 +69,10 @@
EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix) : m_matrix(matrix) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
return m_matrix.rows() + (int(Direction) == Vertical ? 1 : 0);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept {
return m_matrix.cols() + (int(Direction) == Horizontal ? 1 : 0);
}
@@ -244,8 +244,8 @@
EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
: m_lhs(take_matrix_for_product<Lhs>::run(lhs)), m_rhs(rhs) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
template <typename Dest>
EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
@@ -275,8 +275,8 @@
typedef remove_all_t<typename Rhs::Nested> RhsNested;
EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
template <typename Dest>
EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 1d8ded9..147e6e3 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -57,22 +57,22 @@
typedef AngleAxis<Scalar> AngleAxisType;
/** \returns the \c x coefficient */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
+ EIGEN_DEVICE_FUNC constexpr CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
/** \returns the \c y coefficient */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
+ EIGEN_DEVICE_FUNC constexpr CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
/** \returns the \c z coefficient */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
+ EIGEN_DEVICE_FUNC constexpr CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
/** \returns the \c w coefficient */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
+ EIGEN_DEVICE_FUNC constexpr CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
/** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+ EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
/** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+ EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
/** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+ EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
/** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
+ EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients, 3> vec() const { return coeffs().template head<3>(); }
@@ -346,13 +346,11 @@
// We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
/** Default move constructor */
- EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other)
- EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+ EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) noexcept(std::is_nothrow_move_constructible<Scalar>::value)
: m_coeffs(std::move(other.coeffs())) {}
/** Default move assignment operator */
- EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other)
- EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+ EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
m_coeffs = std::move(other.coeffs());
return *this;
}
@@ -793,7 +791,7 @@
} else {
// theta is the angle between the 2 quaternions
Scalar theta = acos(absD);
- Scalar sinTheta = sin(theta);
+ Scalar sinTheta = numext::sqrt(Scalar(1) - absD * absD);
scale0 = sin((Scalar(1) - t) * theta) / sinTheta;
scale1 = sin((t * theta)) / sinTheta;
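Note: this is the slerp optimization from the changelog. Since theta = acos(absD) with absD in [0, 1), theta lies in (0, pi/2], where sin is non-negative, so sin(theta) equals sqrt(1 - absD^2) and one transcendental call is traded for a square root. The identity in isolation (a sketch, not Eigen's internals):

    #include <cmath>

    // For |x| <= 1, sin(acos(x)) is non-negative and equals sqrt(1 - x^2).
    double sin_of_acos(double x) { return std::sqrt(1.0 - x * x); }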
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index b1a9f21..a5d7b60 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -353,10 +353,10 @@
inline QTransform toQTransform(void) const;
#endif
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
return int(Mode) == int(Projective) ? m_matrix.cols() : (m_matrix.cols() - 1);
}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
/** shortcut for m_matrix(row,col);
* \sa MatrixBase::operator(Index,Index) const */
@@ -1059,11 +1059,11 @@
: Scalar(1); // so x has absolute value 1
VectorType sv(svd.singularValues());
sv.coeffRef(Dim - 1) *= x;
- if (scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
+ if (scaling) (*scaling).noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
if (rotation) {
LinearMatrixType m(svd.matrixU());
m.col(Dim - 1) *= x;
- *rotation = m * svd.matrixV().adjoint();
+ (*rotation).noalias() = m * svd.matrixV().adjoint();
}
}
@@ -1182,7 +1182,8 @@
eigen_assert(false && "Invalid transform traits in Transform::Inverse");
}
// translation and remaining parts
- res.matrix().template topRightCorner<Dim, 1>() = -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
+ res.matrix().template topRightCorner<Dim, 1>().noalias() =
+ -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
res.makeAffine(); // we do need this, because in the beginning res is uninitialized
}
return res;
@@ -1432,7 +1433,7 @@
typedef Transform<Scalar, Dim, ResultMode, LhsOptions> ResultType;
static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
ResultType res;
- res.linear() = lhs.linear() * rhs.linear();
+ res.linear().noalias() = lhs.linear() * rhs.linear();
res.translation() = lhs.linear() * rhs.translation() + lhs.translation();
res.makeAffine();
return res;
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 682c4c7..d942ac8 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -69,18 +69,18 @@
EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
/** \brief Returns the x-translation by value. **/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar x() const { return m_coeffs.x(); }
+ EIGEN_DEVICE_FUNC constexpr Scalar x() const { return m_coeffs.x(); }
/** \brief Returns the y-translation by value. **/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar y() const { return m_coeffs.y(); }
+ EIGEN_DEVICE_FUNC constexpr Scalar y() const { return m_coeffs.y(); }
/** \brief Returns the z-translation by value. **/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar z() const { return m_coeffs.z(); }
+ EIGEN_DEVICE_FUNC constexpr Scalar z() const { return m_coeffs.z(); }
/** \brief Returns the x-translation as a reference. **/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& x() { return m_coeffs.x(); }
+ EIGEN_DEVICE_FUNC constexpr Scalar& x() { return m_coeffs.x(); }
/** \brief Returns the y-translation as a reference. **/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& y() { return m_coeffs.y(); }
+ EIGEN_DEVICE_FUNC constexpr Scalar& y() { return m_coeffs.y(); }
/** \brief Returns the z-translation as a reference. **/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& z() { return m_coeffs.z(); }
+ EIGEN_DEVICE_FUNC constexpr Scalar& z() { return m_coeffs.z(); }
EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 440573f..d49c961 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -183,7 +183,7 @@
* \returns Number of rows
* \details This equals the dimension of the space that the transformation acts on.
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
return Side == OnTheLeft ? m_vectors.rows() : m_vectors.cols();
}
@@ -191,7 +191,7 @@
* \returns Number of columns
* \details This equals the dimension of the space that the transformation acts on.
*/
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return rows(); }
/** \brief Essential part of a Householder vector.
* \param[in] k Index of Householder reflection
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 0beef60..904d853 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -51,8 +51,8 @@
compute(mat);
}
- EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
- EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+ constexpr Index rows() const noexcept { return m_invdiag.size(); }
+ constexpr Index cols() const noexcept { return m_invdiag.size(); }
template <typename MatType>
DiagonalPreconditioner& analyzePattern(const MatType&) {
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index ba379ec..dd40058 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -84,10 +84,10 @@
}
/** \returns number of rows of the factored matrix */
- EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); }
+ constexpr Index rows() const noexcept { return m_L.rows(); }
/** \returns number of columns of the factored matrix */
- EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); }
+ constexpr Index cols() const noexcept { return m_L.cols(); }
/** \brief Reports whether previous computation was successful.
*
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 930077d..11ce5e5 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -135,9 +135,9 @@
/** \brief Extraction Method for U-Factor */
const FactorType matrixU() const;
- EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+ constexpr Index rows() const noexcept { return m_lu.rows(); }
- EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+ constexpr Index cols() const noexcept { return m_lu.cols(); }
/** \brief Reports whether previous computation was successful.
*
@@ -446,4 +446,4 @@
} // end namespace Eigen
-#endif // EIGEN_INCOMPLETE_LUT_H
\ No newline at end of file
+#endif // EIGEN_INCOMPLETE_LUT_H
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index cf85f2e..5caa396 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -218,10 +218,10 @@
}
/** \internal */
- EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); }
+ constexpr Index rows() const noexcept { return matrix().rows(); }
/** \internal */
- EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); }
+ constexpr Index cols() const noexcept { return matrix().cols(); }
/** \returns the tolerance threshold used by the stopping criteria.
* \sa setTolerance()
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 2b146b3..271679f 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -50,8 +50,8 @@
SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
: m_dec(dec), m_rhs(rhs), m_guess(guess) {}
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
index 574021d..12cf6c2 100644
--- a/Eigen/src/KLUSupport/KLUSupport.h
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -100,8 +100,8 @@
if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
}
- EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); }
- EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); }
+ constexpr Index rows() const noexcept { return mp_matrix.rows(); }
+ constexpr Index cols() const noexcept { return mp_matrix.cols(); }
/** \brief Reports whether previous computation was successful.
*
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 3e57764..a725a7b 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -391,8 +391,8 @@
MatrixType reconstructedMatrix() const;
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+ EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lu.rows(); }
+ EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_lu.cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template <typename RhsType, typename DstType>
@@ -717,7 +717,7 @@
// Step 2
m_lu.topLeftCorner(smalldim, smalldim).template triangularView<UnitLower>().solveInPlace(c.topRows(smalldim));
- if (rows > cols) c.bottomRows(rows - cols) -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
+ if (rows > cols) c.bottomRows(rows - cols).noalias() -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
// Step 3
m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 1edd6b8..f09b90e 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -210,8 +210,8 @@
MatrixType reconstructedMatrix() const;
- EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
- EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+ constexpr Index rows() const noexcept { return m_lu.rows(); }
+ constexpr Index cols() const noexcept { return m_lu.cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template <typename RhsType, typename DstType>
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index d1ad63d..dcb4dba 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -379,7 +379,7 @@
Index l_rank = rank();
tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs;
tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
- dst = m_matrixV.leftCols(l_rank) * tmp;
+ dst.noalias() = m_matrixV.leftCols(l_rank) * tmp;
}
template <typename Derived>
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index d78b30b..6df6318 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -172,7 +172,7 @@
// 1 - update the k-th column of A
SubColumnType v_k = A.col(k).tail(remainingRows);
v_k -= V_k1 * Y.row(k).head(k).adjoint();
- if (k) v_k -= X_k1 * A.col(k).head(k);
+ if (k) v_k.noalias() -= X_k1 * A.col(k).head(k);
// 2 - construct left Householder transform in-place
v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
@@ -203,7 +203,7 @@
SubRowType u_k(A.row(k).tail(remainingCols));
u_k = u_k.conjugate();
{
- u_k -= Y_k * A.row(k).head(k + 1).adjoint();
+ u_k.noalias() -= Y_k * A.row(k).head(k + 1).adjoint();
if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
}
diff --git a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
index a45be20..c603a38 100644
--- a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
+++ b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
@@ -308,19 +308,24 @@
this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
}
};
+ static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+ using head_loop =
+ unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+ using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
const Index size = kernel.size();
const Index alignedStart =
DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
- unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);
+ head_loop::run(kernel, 0, alignedStart);
constexpr float cost = static_cast<float>(XprEvaluationCost);
AssignmentFunctor functor(kernel);
device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
- unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+ tail_loop::run(kernel, alignedEnd, size);
}
};
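Note: run() splits the assignment into an unaligned head, a parallelized aligned body, and a tail; the new head_loop/tail_loop aliases thread UsePacketSegment through so the ragged ends can use masked packet segments. The index arithmetic in isolation (a sketch; numext::round_down(x, n) behaves like x - x % n for non-negative x):

    // Ranges produced by run() above:
    //   [0, alignedStart)           -> head_loop
    //   [alignedStart, alignedEnd)  -> parallelFor over whole packets
    //   [alignedEnd, size)          -> tail_loop
    long aligned_end(long size, long alignedStart, long packetSize) {
      return alignedStart + (size - alignedStart) / packetSize * packetSize;
    }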
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h
index 5a2f38f..ff98639 100644
--- a/Eigen/src/misc/lapacke_helpers.h
+++ b/Eigen/src/misc/lapacke_helpers.h
@@ -75,7 +75,7 @@
/// translates storage order of the given Eigen object to the corresponding lapack constant
template <typename Derived>
-EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR lapack_int lapack_storage_of(const EigenBase<Derived> &) {
+EIGEN_ALWAYS_INLINE constexpr lapack_int lapack_storage_of(const EigenBase<Derived> &) {
return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
}
diff --git a/Eigen/src/plugins/BlockMethods.inc b/Eigen/src/plugins/BlockMethods.inc
index 46dc9dd..0782aa3 100644
--- a/Eigen/src/plugins/BlockMethods.inc
+++ b/Eigen/src/plugins/BlockMethods.inc
@@ -1365,6 +1365,6 @@
* \sa subVector(Index)
*/
template <DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index subVectors() const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index subVectors() const {
return (Direction == Vertical) ? cols() : rows();
}
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
index 2a1e7ab..d72d88a 100644
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -1,19 +1,26 @@
include(EigenTesting)
include(CheckCXXSourceCompiles)
-# configure the "site" and "buildname"
+
+# configure the "site" and "buildname"
ei_set_sitename()
# retrieve and store the build string
ei_set_build_string()
add_custom_target(buildtests)
+
+if (NOT EIGEN_CTEST_ARGS)
+ # By default, run tests in parallel on all available cores.
+ set(EIGEN_CTEST_ARGS "-j0" CACHE STRING "Arguments passed to ctest by the check target.")
+endif()
add_custom_target(check COMMAND "ctest" ${EIGEN_CTEST_ARGS})
+
add_dependencies(check buildtests)
# Convenience target for only building GPU tests.
add_custom_target(buildtests_gpu)
-add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
+add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
"--no-compress-output"
"--build-no-clean"
"-T" "test"
@@ -59,7 +66,7 @@
set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
endif()
-
+
elseif(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 0d123a2..6178d4b 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -115,3 +115,8 @@
WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc)
add_dependencies(doc doc-eigen-prerequisites doc-unsupported-prerequisites)
+
+add_custom_target(install-doc ALL
+ COMMAND ${CMAKE_COMMAND} -E copy_directory ${Eigen_BINARY_DIR}/doc ${CMAKE_INSTALL_FULL_DOCDIR}
+)
+add_dependencies(install-doc doc)
diff --git a/scripts/msvc_setup.ps1 b/scripts/msvc_setup.ps1
new file mode 100644
index 0000000..e2d0642
--- /dev/null
+++ b/scripts/msvc_setup.ps1
@@ -0,0 +1,21 @@
+# PowerShell script to set up the MSVC environment.
+
+param ($EIGEN_CI_MSVC_ARCH, $EIGEN_CI_MSVC_VER)
+
+Set-PSDebug -Trace 1
+
+function Get-ScriptDirectory { Split-Path $MyInvocation.ScriptName }
+
+# Set defaults if not already set.
+IF (!$EIGEN_CI_MSVC_ARCH) { $EIGEN_CI_MSVC_ARCH = "x64" }
+IF (!$EIGEN_CI_MSVC_VER) { $EIGEN_CI_MSVC_VER = "14.29" }
+
+# Export variables into the global scope
+$global:EIGEN_CI_MSVC_ARCH = $EIGEN_CI_MSVC_ARCH
+$global:EIGEN_CI_MSVC_VER = $EIGEN_CI_MSVC_VER
+
+# Find Visual Studio installation directory.
+$global:VS_INSTALL_DIR = &"${Env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
+# Run the VCVarsAll.bat initialization script and extract environment variables.
+# http://allen-mack.blogspot.com/2008/03/replace-visual-studio-command-prompt.html
+cmd.exe /c "`"${VS_INSTALL_DIR}\VC\Auxiliary\Build\vcvarsall.bat`" $EIGEN_CI_MSVC_ARCH -vcvars_ver=$EIGEN_CI_MSVC_VER & set" | foreach { if ($_ -match "=") { $v = $_.split("="); set-item -force -path "ENV:\$($v[0])" -value "$($v[1])" } }
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 813cc53..e62ec45 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -186,6 +186,7 @@
ei_add_test(float_conversion)
ei_add_test(io)
ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
+ei_add_test(packet_segment)
ei_add_test(vectorization_logic)
ei_add_test(basicstuff)
ei_add_test(constexpr)
diff --git a/test/packet_ostream.h b/test/packet_ostream.h
deleted file mode 100644
index 49e1bb0..0000000
--- a/test/packet_ostream.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef TEST_PACKET_OSTREAM
-#define TEST_PACKET_OSTREAM
-
-#include <type_traits>
-#include <ostream>
-
-// Include this header to be able to print Packets while debugging.
-
-template <typename Packet,
- typename EnableIf = std::enable_if_t<Eigen::internal::unpacket_traits<Packet>::vectorizable> >
-std::ostream& operator<<(std::ostream& os, const Packet& packet) {
- using Scalar = typename Eigen::internal::unpacket_traits<Packet>::type;
- Scalar v[Eigen::internal::unpacket_traits<Packet>::size];
- Eigen::internal::pstoreu(v, packet);
- os << "{" << v[0];
- for (int i = 1; i < Eigen::internal::unpacket_traits<Packet>::size; ++i) {
- os << "," << v[i];
- }
- os << "}";
- return os;
-}
-
-#endif // TEST_PACKET_OSTREAM
\ No newline at end of file
diff --git a/test/packet_segment.cpp b/test/packet_segment.cpp
new file mode 100644
index 0000000..6fa6a29
--- /dev/null
+++ b/test/packet_segment.cpp
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template <typename Scalar, typename Packet>
+void verify_data(const Scalar* data_in, const Scalar* data_out, const Packet& a, Index begin, Index count) {
+ constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
+ bool ok = true;
+ for (Index i = begin; i < begin + count; i++) {
+ ok = ok && numext::equal_strict(data_in[i], data_out[i]);
+ }
+ if (!ok) {
+ std::cout << "begin: " << begin << ", count: " << count << "\n";
+ std::cout << "Scalar type: " << type_name(Scalar()) << " x " << PacketSize << "\n";
+ std::cout << "data in: {";
+ for (Index i = 0; i < PacketSize; i++) {
+ if (i > 0) std::cout << ",";
+ if (i < begin || i >= begin + count) {
+ std::cout << "MASK";
+ } else {
+ std::cout << data_in[i];
+ }
+ }
+ std::cout << "}\n";
+ std::cout << "data out: {";
+ for (Index i = 0; i < PacketSize; i++) {
+ if (i > 0) std::cout << ",";
+ if (i < begin || i >= begin + count) {
+ std::cout << "MASK";
+ } else {
+ std::cout << data_out[i];
+ }
+ }
+ std::cout << "}\n";
+ std::cout << "packet: ";
+ std::cout << internal::postream(a) << "\n";
+ }
+ VERIFY(ok);
+}
+
+template <typename Scalar, int PacketSize, bool Run = internal::find_packet_by_size<Scalar, PacketSize>::value>
+struct packet_segment_test_impl {
+ using Packet = typename internal::find_packet_by_size<Scalar, PacketSize>::type;
+ static void test_unaligned() {
+ // test loading a packet segment from unaligned memory that includes unallocated memory
+
+ // | X X X X | * * * X | X X X X |
+ // begin -> { X | * * * } <- begin + count
+
+ VectorX<Scalar> data_in(PacketSize), data_out(PacketSize);
+ data_in.setRandom();
+ data_out.setRandom();
+
+ Scalar* unaligned_data_in = data_in.data() - 1;
+ Scalar* unaligned_data_out = data_out.data() - 1;
+
+ Index begin = 1;
+ Index count = PacketSize - 1;
+
+ Packet a = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
+ internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, a, begin, count);
+
+ verify_data(unaligned_data_in, unaligned_data_out, a, begin, count);
+
+ // test loading the entire packet
+
+ data_in.setRandom();
+ data_out.setRandom();
+
+ unaligned_data_in = data_in.data();
+ unaligned_data_out = data_out.data();
+
+ begin = 0;
+ count = PacketSize;
+
+ Packet b = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
+ internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, b, begin, count);
+
+ verify_data(unaligned_data_in, unaligned_data_out, b, begin, count);
+
+ // test loading an empty packet segment in unallocated memory
+ count = 0;
+
+ for (begin = 0; begin < PacketSize; begin++) {
+ data_in.setRandom();
+ data_out = data_in;
+ Packet c = internal::ploaduSegment<Packet>(data_in.data(), begin, count);
+ internal::pstoreuSegment<Scalar, Packet>(data_out.data(), c, begin, count);
+ // verify that ploaduSegment / pstoreuSegment did nothing
+ VERIFY_IS_CWISE_EQUAL(data_in, data_out);
+ }
+ }
+ static void test_aligned() {
+ // test loading a packet segment from aligned memory that includes unallocated memory
+
+ // | X X X X | * * * X | X X X X |
+ // begin -> { * * * X } <- begin + count
+
+ VectorX<Scalar> data_in(PacketSize - 1), data_out(PacketSize - 1);
+ data_in.setRandom();
+ data_out.setRandom();
+
+ Scalar* aligned_data_in = data_in.data();
+ Scalar* aligned_data_out = data_out.data();
+
+ Index begin = 0;
+ Index count = PacketSize - 1;
+
+ Packet b = internal::ploadSegment<Packet>(aligned_data_in, begin, count);
+ internal::pstoreSegment<Scalar, Packet>(aligned_data_out, b, begin, count);
+
+ verify_data(aligned_data_in, aligned_data_out, b, begin, count);
+ }
+ static void run() {
+ test_unaligned();
+ test_aligned();
+ }
+};
+
+template <typename Scalar, int PacketSize>
+struct packet_segment_test_impl<Scalar, PacketSize, false> {
+ static void run() {}
+};
+
+template <typename Scalar, int PacketSize>
+struct packet_segment_test_driver {
+ static void run() {
+ packet_segment_test_impl<Scalar, PacketSize>::run();
+ packet_segment_test_driver<Scalar, PacketSize / 2>::run();
+ }
+};
+
+template <typename Scalar>
+struct packet_segment_test_driver<Scalar, 1> {
+ static void run() {}
+};
+
+template <typename Scalar>
+void test_packet_segment() {
+ packet_segment_test_driver<Scalar, internal::packet_traits<Scalar>::size>::run();
+}
+
+EIGEN_DECLARE_TEST(packet_segment) {
+ for (int i = 0; i < g_repeat; i++) {
+ test_packet_segment<bool>();
+ test_packet_segment<int8_t>();
+ test_packet_segment<uint8_t>();
+ test_packet_segment<int16_t>();
+ test_packet_segment<uint16_t>();
+ test_packet_segment<int32_t>();
+ test_packet_segment<uint32_t>();
+ test_packet_segment<int64_t>();
+ test_packet_segment<uint64_t>();
+ test_packet_segment<bfloat16>();
+ test_packet_segment<half>();
+ test_packet_segment<float>();
+ test_packet_segment<double>();
+ test_packet_segment<std::complex<float>>();
+ test_packet_segment<std::complex<double>>();
+ }
+}
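Note: this test exercises the masked load/store framework from the changelog. ploaduSegment<Packet>(ptr, begin, count) reads only lanes [begin, begin + count) (the remaining lanes of the returned packet are unspecified), and pstoreuSegment writes only those lanes back, which is why the test can deliberately position its buffers so that masked-out lanes fall outside the allocation. A minimal sketch against the same internal API, assuming a vectorized build whose float packet has at least four lanes:

    #include <Eigen/Core>

    void segment_demo() {
      using Packet = Eigen::internal::packet_traits<float>::type;
      float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      Packet p = Eigen::internal::ploaduSegment<Packet>(buf, 1, 2);  // reads buf[1..2] only
      Eigen::internal::pstoreuSegment<float, Packet>(buf, p, 1, 2);  // writes buf[1..2] only
    }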
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 9c5d6cf..7647592 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -10,7 +10,6 @@
#include "packetmath_test_shared.h"
#include "random_without_cast_overflow.h"
-#include "packet_ostream.h"
template <typename T>
inline T REF_ADD(const T& a, const T& b) {
@@ -24,21 +23,55 @@
inline T REF_MUL(const T& a, const T& b) {
return a * b;
}
+
+template <typename Scalar, typename EnableIf = void>
+struct madd_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return a * b + c;
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return a * b - c;
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return c - a * b;
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return Scalar(0) - (a * b + c);
+ }
+};
+
+template <typename Scalar>
+struct madd_impl<Scalar,
+ std::enable_if_t<Eigen::internal::is_scalar<Scalar>::value && Eigen::NumTraits<Scalar>::IsSigned>> {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return numext::fma(a, b, c);
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return numext::fma(a, b, Scalar(-c));
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return numext::fma(Scalar(-a), b, c);
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+ return -Scalar(numext::fma(a, b, c));
+ }
+};
+
template <typename T>
inline T REF_MADD(const T& a, const T& b, const T& c) {
- return a * b + c;
+ return madd_impl<T>::madd(a, b, c);
}
template <typename T>
inline T REF_MSUB(const T& a, const T& b, const T& c) {
- return a * b - c;
+ return madd_impl<T>::msub(a, b, c);
}
template <typename T>
inline T REF_NMADD(const T& a, const T& b, const T& c) {
- return c - a * b;
+ return madd_impl<T>::nmadd(a, b, c);
}
template <typename T>
inline T REF_NMSUB(const T& a, const T& b, const T& c) {
- return test::negate(a * b + c);
+ return madd_impl<T>::nmsub(a, b, c);
}
template <typename T>
inline T REF_DIV(const T& a, const T& b) {
@@ -70,6 +103,14 @@
inline bool REF_MADD(const bool& a, const bool& b, const bool& c) {
return (a && b) || c;
}
+template <>
+inline bool REF_DIV(const bool& a, const bool& b) {
+ return a && b;
+}
+template <>
+inline bool REF_RECIPROCAL(const bool& a) {
+ return a;
+}
template <typename T>
inline T REF_FREXP(const T& x, T& exp) {
@@ -92,6 +133,26 @@
return static_cast<T>(ldexp(x, static_cast<int>(exp)));
}
+// provides a convenient function to take the absolute value of each component of a complex number to prevent
+// catastrophic cancellation in randomly generated complex numbers
+template <typename T, bool IsComplex = NumTraits<T>::IsComplex>
+struct abs_helper_impl {
+ static T run(T x) { return numext::abs(x); }
+};
+template <typename T>
+struct abs_helper_impl<T, true> {
+ static T run(T x) {
+ T res = x;
+ numext::real_ref(res) = numext::abs(numext::real(res));
+ numext::imag_ref(res) = numext::abs(numext::imag(res));
+ return res;
+ }
+};
+template <typename T>
+T abs_helper(T x) {
+ return abs_helper_impl<T>::run(x);
+}
+
// Uses pcast to cast from one array to another.
template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
struct pcast_array;
@@ -481,8 +542,8 @@
eigen_optimization_barrier_test<Scalar>::run();
for (int i = 0; i < size; ++i) {
- data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
- data2[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+ data1[i] = internal::random<Scalar>();
+ data2[i] = internal::random<Scalar>();
refvalue = (std::max)(refvalue, numext::abs(data1[i]));
}
@@ -502,8 +563,8 @@
for (int M = 0; M < PacketSize; ++M) {
for (int N = 0; N <= PacketSize; ++N) {
for (int j = 0; j < size; ++j) {
- data1[j] = internal::random<Scalar>() / RealScalar(PacketSize);
- data2[j] = internal::random<Scalar>() / RealScalar(PacketSize);
+ data1[j] = internal::random<Scalar>();
+ data2[j] = internal::random<Scalar>();
refvalue = (std::max)(refvalue, numext::abs(data1[j]));
}
@@ -568,6 +629,7 @@
negate_test<Scalar, Packet>(data1, data2, ref, PacketSize);
CHECK_CWISE1_IF(PacketTraits::HasReciprocal, REF_RECIPROCAL, internal::preciprocal);
CHECK_CWISE1(numext::conj, internal::pconj);
+
CHECK_CWISE1_IF(PacketTraits::HasSign, numext::sign, internal::psign);
for (int offset = 0; offset < 3; ++offset) {
@@ -632,11 +694,17 @@
// Avoid overflows.
if (NumTraits<Scalar>::IsInteger && NumTraits<Scalar>::IsSigned &&
Eigen::internal::unpacket_traits<Packet>::size > 1) {
- Scalar limit =
- static_cast<Scalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())),
- 1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size)));
+ Scalar limit = static_cast<Scalar>(
+ static_cast<RealScalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())),
+ 1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size))));
for (int i = 0; i < PacketSize; ++i) {
- data1[i] = internal::random<Scalar>(-limit, limit);
+ data1[i] = internal::random<Scalar>(Scalar(0) - limit, limit);
+ }
+ } else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex) {
+ // Prevent very small product results by adjusting range. Otherwise,
+ // we may end up multiplying e.g. 32 Eigen::halfs with values < 1.
+ for (int i = 0; i < PacketSize; ++i) {
+ data1[i] = internal::random<Scalar>(Scalar(0.5), Scalar(1)) * (internal::random<bool>() ? Scalar(-1) : Scalar(1));
}
}
ref[0] = Scalar(1);
@@ -724,11 +792,6 @@
packetmath_pcast_ops_runner<Scalar, Packet>::run();
packetmath_minus_zero_add_test<Scalar, Packet>::run();
- for (int i = 0; i < size; ++i) {
- data1[i] = numext::abs(internal::random<Scalar>());
- }
- CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
- CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
CHECK_CWISE3_IF(true, REF_MADD, internal::pmadd);
if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) {
nmsub_test<Scalar, Packet>(data1, data2, ref, PacketSize);
@@ -738,14 +801,18 @@
// which can lead to very flaky tests. Here we ensure the signs are such that
// they do not cancel.
for (int i = 0; i < PacketSize; ++i) {
- data1[i] = numext::abs(internal::random<Scalar>());
- data1[i + PacketSize] = numext::abs(internal::random<Scalar>());
- data1[i + 2 * PacketSize] = Scalar(0) - numext::abs(internal::random<Scalar>());
+ data1[i] = abs_helper(internal::random<Scalar>());
+ data1[i + PacketSize] = abs_helper(internal::random<Scalar>());
+ data1[i + 2 * PacketSize] = Scalar(0) - abs_helper(internal::random<Scalar>());
}
if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) {
CHECK_CWISE3_IF(true, REF_MSUB, internal::pmsub);
CHECK_CWISE3_IF(true, REF_NMADD, internal::pnmadd);
}
+
+ CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
+ CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
+ CHECK_CWISE1_IF(PacketTraits::HasCbrt, numext::cbrt, internal::pcbrt);
}
// Notice that this definition works for complex types as well.
@@ -767,6 +834,7 @@
CREATE_FUNCTOR(psqrt_functor, internal::psqrt);
CREATE_FUNCTOR(prsqrt_functor, internal::prsqrt);
+CREATE_FUNCTOR(pcbrt_functor, internal::pcbrt);
// TODO(rmlarsen): Run this test for more functions.
template <bool Cond, typename Scalar, typename Packet, typename RefFunctorT, typename FunctorT>
@@ -1137,6 +1205,7 @@
packetmath_test_IEEE_corner_cases<PacketTraits::HasSqrt, Scalar, Packet>(numext::sqrt<Scalar>, psqrt_functor());
packetmath_test_IEEE_corner_cases<PacketTraits::HasRsqrt, Scalar, Packet>(numext::rsqrt<Scalar>, prsqrt_functor());
+ packetmath_test_IEEE_corner_cases<PacketTraits::HasCbrt, Scalar, Packet>(numext::cbrt<Scalar>, pcbrt_functor());
// TODO(rmlarsen): Re-enable for half and bfloat16.
if (PacketTraits::HasCos && !internal::is_same<Scalar, half>::value &&
@@ -1665,7 +1734,7 @@
for (Index N = 0; N <= PacketSize; ++N) {
for (Index i = 0; i < N; ++i) {
- data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+ data1[i] = internal::random<Scalar>();
}
for (Index i = 0; i < N * 20; ++i) {
@@ -1684,7 +1753,7 @@
}
for (Index i = 0; i < N * 7; ++i) {
- buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+ buffer[i] = internal::random<Scalar>();
}
packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
internal::pstore_partial(data1, packet, N);
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
index 93d4149..7d7a0da 100644
--- a/test/packetmath_test_shared.h
+++ b/test/packetmath_test_shared.h
@@ -162,7 +162,9 @@
template <typename T>
inline Packet load(const T* from, unsigned long long umask) const {
- return internal::ploadu<Packet>(from, umask);
+ using UMaskType = typename numext::get_integer_by_size<internal::plain_enum_max(
+ internal::unpacket_traits<Packet>::size / CHAR_BIT, 1)>::unsigned_type;
+ return internal::ploadu<Packet>(from, static_cast<UMaskType>(umask));
}
template <typename T>
@@ -172,7 +174,9 @@
template <typename T>
inline void store(T* to, const Packet& x, unsigned long long umask) const {
- internal::pstoreu(to, x, umask);
+ using UMaskType = typename numext::get_integer_by_size<internal::plain_enum_max(
+ internal::unpacket_traits<Packet>::size / CHAR_BIT, 1)>::unsigned_type;
+ internal::pstoreu(to, x, static_cast<UMaskType>(umask));
}
template <typename T>
diff --git a/test/product.h b/test/product.h
index f8eb5df..f37a932 100644
--- a/test/product.h
+++ b/test/product.h
@@ -38,6 +38,15 @@
std::enable_if_t<RhsType::SizeAtCompileTime != Dynamic, void> check_mismatched_product(LhsType& /*unused*/,
const RhsType& /*unused*/) {}
+template <typename Scalar, typename V1, typename V2>
+Scalar ref_dot_product(const V1& v1, const V2& v2) {
+ Scalar out = Scalar(0);
+ for (Index i = 0; i < v1.size(); ++i) {
+ out = Eigen::numext::fma(v1[i], v2[i], out);
+ }
+ return out;
+}
+
template <typename MatrixType>
void product(const MatrixType& m) {
/* this test covers the following files:
@@ -245,7 +254,10 @@
// inner product
{
Scalar x = square2.row(c) * square2.col(c2);
- VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
+ // NOTE: FMA is necessary here in the reference to ensure accuracy for
+ // large vector sizes and float16/bfloat16 types.
+ Scalar y = ref_dot_product<Scalar>(square2.row(c), square2.col(c2));
+ VERIFY_IS_APPROX(x, y);
}
// outer product
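Note: numext::fma(a, b, c) evaluates a * b + c with a single rounding where the platform provides it (with the changelog's float fallback for half and bfloat16), which keeps the scalar reference accurate enough to compare against Eigen's vectorized inner product for low-precision types. A standalone sketch mirroring ref_dot_product above:

    #include <Eigen/Dense>

    float ref_dot(const Eigen::VectorXf& u, const Eigen::VectorXf& v) {
      float acc = 0.0f;
      for (Eigen::Index i = 0; i < u.size(); ++i)
        acc = Eigen::numext::fma(u[i], v[i], acc);  // one rounding per step
      return acc;
    }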
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index 902aa96..ea210a1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -45,19 +45,19 @@
template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
struct TVPanelSize {
// LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+ static constexpr StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
// LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ static constexpr StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
// TileSizeDimNC: determines the tile size for the non-contracting dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+ static constexpr StorageIndex TileSizeDimNC = NCWindow / NCFactor;
// TileSizeDimC: determines the tile size for the contracting dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+ static constexpr StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
// WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+ static constexpr StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
// WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+ static constexpr StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
// BC : determines if supporting bank conflict is required
- static EIGEN_CONSTEXPR bool BC = false;
+ static constexpr bool BC = false;
};
#endif
@@ -81,40 +81,40 @@
template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
struct TTPanelSize {
// TileSizeDimK: determines the tile size for dimension K; the value is assumed to account for the packet size
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+ static constexpr StorageIndex TileSizeDimK = TSDK;
// WorkLoadPerThreadM: determines workload per thread for loading the M dimension. This can be varied based on the
// available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro).
#ifndef EIGEN_SYCL_REG_M
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+ static constexpr StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
#else
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+ static constexpr StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
#endif
// WorkLoadPerThreadN: determines workload per thread for loading the N dimension. This can be varied based on the
// available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro).
#ifndef EIGEN_SYCL_REG_N
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+ static constexpr StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
#else
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+ static constexpr StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
#endif
// LocalThreadSizeM: determines the total number of threads per workgroup for the m dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+ static constexpr StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
// LocalThreadSizeN: determines the total number of threads per workgroup for the n dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ static constexpr StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
// TileSizeDimM: determines the tile size for the m dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+ static constexpr StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
// TileSizeDimN: determines the tile size for the n dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+ static constexpr StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
// LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize
- static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+ static constexpr StorageIndex LoadPerThreadLhs =
((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
// LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize
- static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+ static constexpr StorageIndex LoadPerThreadRhs =
((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
// BC : determines if supporting bank conflict is required
- static EIGEN_CONSTEXPR bool BC = true;
+ static constexpr bool BC = true;
// DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
// EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
- static EIGEN_CONSTEXPR bool DoubleBuffer =
+ static constexpr bool DoubleBuffer =
#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
false;
#else
@@ -220,7 +220,7 @@
template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<dt != data_source::global_mem, void> write(
PacketType &packet_data, DataScalar ptr) {
- EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+ constexpr int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; i++) {
*ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
@@ -320,14 +320,14 @@
*/
template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
struct BlockProperties {
- static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+ static constexpr bool packet_load = packet_load_;
typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
- static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+ static constexpr bool is_rhs = is_rhs_;
typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType;
- static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
- static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
- static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
- static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+ static constexpr int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+ static constexpr bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+ static constexpr int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+ static constexpr int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
};
/*!
@@ -458,11 +458,11 @@
public:
typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
PacketReturnType;
- static EIGEN_CONSTEXPR int PacketSize =
+ static constexpr int PacketSize =
Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
- static EIGEN_CONSTEXPR bool is_lhs_transposed =
+ static constexpr bool is_lhs_transposed =
!::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
- static EIGEN_CONSTEXPR bool is_rhs_transposed =
+ static constexpr bool is_rhs_transposed =
!::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
@@ -473,20 +473,20 @@
PacketReturnType>
RHSBlockProperties;
- static EIGEN_CONSTEXPR StorageIndex NStride =
+ static constexpr StorageIndex NStride =
contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr> tile_ptr;
- static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
- ? Properties::TileSizeDimM + Properties::BC
- : Properties::WorkLoadPerThreadM;
- static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
- ? Properties::TileSizeDimN + Properties::BC
- : Properties::WorkLoadPerThreadN;
- static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+ static constexpr StorageIndex LSDL = contraction_tp == contraction_type::local
+ ? Properties::TileSizeDimM + Properties::BC
+ : Properties::WorkLoadPerThreadM;
+ static constexpr StorageIndex LSDR = contraction_tp == contraction_type::local
+ ? Properties::TileSizeDimN + Properties::BC
+ : Properties::WorkLoadPerThreadN;
+ static constexpr StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
/**
* \brief MemHolder: a placeholder struct for creating the memory hierarchy in SYCL. Inside a SYCL kernel it is not
@@ -638,7 +638,7 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
PacketReturnType *privateRes) const {
StorageIndex idx = 0;
- EIGEN_CONSTEXPR StorageIndex lhs_stride =
+ constexpr StorageIndex lhs_stride =
contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
EIGEN_UNROLL_LOOP
for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
@@ -668,8 +668,7 @@
// when local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
// available, the k*N tile is transposed in local memory to N*K; therefore, each block operates on a blockId *
// WorkLoadPerThreadN slice of N
- EIGEN_CONSTEXPR StorageIndex GlobalNStride =
- contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+ constexpr StorageIndex GlobalNStride = contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
EIGEN_UNROLL_LOOP
for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
// output leading dimension
@@ -713,9 +712,9 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::no_local> extract_block(
const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
- EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+ constexpr StorageIndex LocalThreadSizeNC =
InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
- EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+ constexpr StorageIndex WorkLoadPerThreadNC =
InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
@@ -891,11 +890,11 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::local> extract_block(
const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex> &local_index,
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
- EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+ constexpr StorageIndex TileSizeDimNC =
InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
- EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+ constexpr StorageIndex LoadPerThread =
InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
- EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+ constexpr StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
(LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
" LocalOffset must be divisible by stride");
@@ -995,11 +994,11 @@
struct GeneralVectorTensor {
typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
PacketReturnType;
- static EIGEN_CONSTEXPR int PacketSize =
+ static constexpr int PacketSize =
Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
- static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+ static constexpr StorageIndex OutScratchOffset =
KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
// Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
@@ -1328,8 +1327,8 @@
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
struct input_mapper_propertis {
- static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
- static EIGEN_CONSTEXPR bool is_rhs_matrix =
+ static constexpr bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+ static constexpr bool is_rhs_matrix =
(RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
};
@@ -1537,9 +1536,9 @@
void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
StorageIndex NC, StorageIndex C) const {
const StorageIndex nonContractDim = NC;
- EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
- EIGEN_CONSTEXPR StorageIndex CFactor = 1;
- EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+ constexpr StorageIndex NCFactor = 1;
+ constexpr StorageIndex CFactor = 1;
+ constexpr StorageIndex NCWindow = 16;
typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
Properties;
const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
@@ -1601,7 +1600,7 @@
(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
"The Local thread size must be a power of 2 for the reduction "
"operation");
- EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ constexpr StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
// Here we force the code not to be more than a 2-step reduction: our empirical research shows that if each thread
// reduces at least 512 elements individually, we get better performance.
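A minimal sketch of the sizing arithmetic this heuristic implies (kLocalRange, kPerThread, and reduction_groups are illustrative names, not Eigen internals):

  // Sketch of the two-step reduction sizing described above; a second
  // step is launched only when one pass cannot cover the input.
  #include <algorithm>
  #include <cstdint>

  constexpr std::int64_t kLocalRange = 16 * 16;  // threads per work-group
  constexpr std::int64_t kPerThread = 512;       // min elements per thread

  inline std::int64_t reduction_groups(std::int64_t n) {
    const std::int64_t groups =
        (n + kPerThread * kLocalRange - 1) / (kPerThread * kLocalRange);
    // Cap the first-step group count so step two fits in one work-group.
    return n > kPerThread * kLocalRange ? std::min(groups, kLocalRange) : 1;
  }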
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 6a7571c..394c150 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -42,10 +42,10 @@
typedef Index Real;
enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 };
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
+ EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+ EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+ EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real highest() { return n; }
+ EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real lowest() { return n; }
};
namespace internal {
@@ -569,47 +569,47 @@
namespace Eigen {
namespace internal {
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
+static EIGEN_DEVICE_FUNC constexpr bool index_known_statically(Index i) {
return index_known_statically_impl<T>::run(i);
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
+static EIGEN_DEVICE_FUNC constexpr bool all_indices_known_statically() {
return all_indices_known_statically_impl<T>::run();
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
+static EIGEN_DEVICE_FUNC constexpr bool indices_statically_known_to_increase() {
return indices_statically_known_to_increase_impl<T>::run();
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_eq(Index i, Index value) {
return index_statically_eq_impl<T>::run(i, value);
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_ne(Index i, Index value) {
return index_statically_ne_impl<T>::run(i, value);
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_gt(Index i, Index value) {
return index_statically_gt_impl<T>::run(i, value);
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_lt(Index i, Index value) {
return index_statically_lt_impl<T>::run(i, value);
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_pair_first_statically_eq(Index i, Index value) {
return index_pair_first_statically_eq_impl<T>::run(i, value);
}
template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC constexpr bool index_pair_second_statically_eq(Index i, Index value) {
return index_pair_second_statically_eq_impl<T>::run(i, value);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 8c2bb2e..8454070 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -212,9 +212,9 @@
typedef U first_type;
typedef V second_type;
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {}
+ constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {}
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {}
+ constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) {
using numext::swap;
@@ -224,20 +224,20 @@
};
template <typename U, typename V>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) {
return (x.first == y.first && x.second == y.second);
}
template <typename U, typename V>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) {
return !(x == y);
}
// Can't use std::pair on cuda devices
template <typename Idx>
struct IndexPair {
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+ constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+ constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
first = val.first;
@@ -251,19 +251,18 @@
namespace internal {
template <typename IndexType, typename Index, Index First, Index... Is>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array(
IndexType& idx, numeric_list<Index, First, Is...>) {
return {static_cast<Index>(idx[First]), static_cast<Index>(idx[Is])...};
}
template <typename IndexType, typename Index>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&,
- numeric_list<Index>) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
return array<Index, 0>();
}
/** Make an array (for index/dimensions) out of a custom index */
template <typename Index, std::size_t NumIndices, typename IndexType>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) {
return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 6944c03..b4749b4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -402,9 +402,9 @@
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
- static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
- static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
- static EIGEN_CONSTEXPR bool BC = BC_;
+ static constexpr Index LocalThreadSizeP = LTP;
+ static constexpr Index LocalThreadSizeR = LTR;
+ static constexpr bool BC = BC_;
};
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
@@ -430,7 +430,7 @@
"The Local thread size must be a power of 2 for the reduction "
"operation");
- EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+ constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
// In this step, we force the code not to be more than a 2-step reduction:
// Our empirical research shows that if each thread reduces at least 64
// elements individually, we get better performance. However, this can change
@@ -445,7 +445,7 @@
const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
const Index globalRange = pNumGroups * rNumGroups * localRange;
- EIGEN_CONSTEXPR Index scratchSize =
+ constexpr Index scratchSize =
PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
if (rNumGroups > 1) {
@@ -482,15 +482,15 @@
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
- static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+ static constexpr bool HasOptimizedImplementation = true;
+ static constexpr int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
"The Local thread size must be a power of 2 for the reduction "
"operation");
- EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
typename Self::Index inputSize = self.impl().dimensions().TotalSize();
// In this step we force the code not to be more than a 2-step reduction:
@@ -535,7 +535,7 @@
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+ static constexpr bool HasOptimizedImplementation = true;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
@@ -549,7 +549,7 @@
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+ static constexpr bool HasOptimizedImplementation = true;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
@@ -566,7 +566,7 @@
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+ static constexpr bool HasOptimizedImplementation = false;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
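The power-of-2 static_asserts in this file use the usual bit trick, shown here in isolation:

  // For x > 0, (x & (x - 1)) clears the lowest set bit, so the result is
  // zero exactly when x has a single set bit, i.e. is a power of two.
  static_assert((256 & 255) == 0, "256 is a power of two");
  static_assert((96 & 95) != 0, "96 is not a power of two");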
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index f0a390f..6de0867 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -197,7 +197,7 @@
// threads. Currently set to twice the cache line size on Intel and ARM
// processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
- EIGEN_CONSTEXPR Index kBlockAlignment = 128;
+ constexpr Index kBlockAlignment = 128;
const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size);
return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
}
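A worked instance of the rounding above, assuming 4-byte items:

  // item_size = 4: items_per_cacheline = max(1, 128 / 4) = 32, so a
  // requested block of 100 items rounds up to 32 * ceil(100 / 32) = 128.
  constexpr long kItemsPerLine = 128 / 4;
  constexpr long kRounded =
      kItemsPerLine * ((100 + kItemsPerLine - 1) / kItemsPerLine);
  static_assert(kRounded == 128, "100 items round up to 128");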
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
index 30fde91..3636788 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -51,7 +51,7 @@
template <typename index_t>
struct ScanParameters {
// must be a power of 2
- static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+ static constexpr index_t ScanPerThread = 8;
const index_t total_size;
const index_t non_scan_size;
const index_t scan_size;
@@ -86,7 +86,7 @@
struct ScanKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
- static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+ static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
LocalAccessor scratch;
Evaluator dev_eval;
@@ -288,7 +288,7 @@
struct ScanAdjustmentKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
- static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+ static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
InAccessor in_ptr;
OutAccessor out_ptr;
const ScanParameters<Index> scanParameters;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index c149985..5357a48 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -27,6 +27,10 @@
typedef std::remove_reference_t<Nested> Nested_;
static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static constexpr int Layout = XprTraits::Layout;
+ enum {
+ // Trace is read-only.
+ Flags = traits<XprType>::Flags & ~LvalueBit
+ };
};
template <typename Dims, typename XprType>
@@ -203,6 +207,8 @@
return true;
}
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return nullptr; }
+
EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
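Together, clearing LvalueBit and returning a null data() pointer are what let a trace expression sit behind a TensorRef, evaluated lazily through coeff(). An illustrative use, mirroring the new test added below:

  #include <unsupported/Eigen/CXX11/Tensor>

  void trace_ref_example() {
    Eigen::Tensor<float, 2> m(4, 4);
    m.setRandom();
    Eigen::TensorRef<const Eigen::Tensor<float, 0>> r(m.trace());
    float t = r.coeff(0);  // trace computed on demand
    (void)t;
  }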
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
index 51c0ad6..ae5c4f4 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -106,7 +106,7 @@
int one;
int two;
int flags;
- constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
+ constexpr Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
};
std::size_t m_numIndices;
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
index 3f9bb51..66a982b 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -25,7 +25,7 @@
constexpr static std::size_t N = sizeof...(nn);
template <typename T>
- constexpr static inline std::array<T, N> run(const std::array<T, N>& indices) {
+ constexpr static std::array<T, N> run(const std::array<T, N>& indices) {
return {{indices[nn]...}};
}
};
@@ -51,7 +51,7 @@
template <typename iib>
struct tensor_static_symgroup_multiply_helper {
template <int... iia>
- constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
+ constexpr static numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
return numeric_list<int, get<iia, iib>::value...>();
}
};
@@ -107,9 +107,9 @@
};
template <typename Index, std::size_t N, int... ii, int... jj>
-constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx,
- internal::numeric_list<int, ii...>,
- internal::numeric_list<int, jj...>) {
+constexpr static std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx,
+ internal::numeric_list<int, ii...>,
+ internal::numeric_list<int, jj...>) {
return {{idx[ii]..., idx[jj]...}};
}
@@ -179,9 +179,9 @@
typedef typename group_elements::type ge;
public:
- constexpr inline StaticSGroup() {}
- constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
- constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
+ constexpr StaticSGroup() {}
+ constexpr StaticSGroup(const StaticSGroup<Gen...>&) {}
+ constexpr StaticSGroup(StaticSGroup<Gen...>&&) {}
template <typename Op, typename RV, typename Index, std::size_t N, typename... Args>
static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) {
@@ -196,8 +196,8 @@
constexpr static std::size_t static_size = ge::count;
- constexpr static inline std::size_t size() { return ge::count; }
- constexpr static inline int globalFlags() { return group_elements::global_flags; }
+ constexpr static std::size_t size() { return ge::count; }
+ constexpr static int globalFlags() { return group_elements::global_flags; }
template <typename Tensor_, typename... IndexTypes>
inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index e3be69d..632f437 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -36,15 +36,15 @@
*/
template <std::size_t I_, class T>
-constexpr inline T& array_get(std::vector<T>& a) {
+constexpr T& array_get(std::vector<T>& a) {
return a[I_];
}
template <std::size_t I_, class T>
-constexpr inline T&& array_get(std::vector<T>&& a) {
+constexpr T&& array_get(std::vector<T>&& a) {
return a[I_];
}
template <std::size_t I_, class T>
-constexpr inline T const& array_get(std::vector<T> const& a) {
+constexpr T const& array_get(std::vector<T> const& a) {
return a[I_];
}
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index 808fd7d..19ec8ea 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -268,7 +268,7 @@
/* compute the scaled predicted reduction and */
/* the scaled directional derivative. */
- wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1);
+ wa3.noalias() = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1);
temp1 = numext::abs2(wa3.stableNorm() / fnorm);
temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
prered = temp1 + temp2 / Scalar(.5);
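The .noalias() hint asserts that wa3 does not alias the right-hand side, so Eigen writes the product directly instead of going through a temporary. A minimal sketch of the same idiom:

  #include <Eigen/Dense>

  void noalias_example() {
    Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
    Eigen::MatrixXd B = Eigen::MatrixXd::Random(3, 3);
    Eigen::MatrixXd C(3, 3);
    C.noalias() = A * B;  // safe: C appears only on the left-hand side
  }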
diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index 4bf7ee5..201fba3 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -75,7 +75,7 @@
/* givens rotation. */
w_givens[j] = givens;
} else
- v_givens[j] = IdentityRotation;
+ w_givens[j] = IdentityRotation;
/* test for zero diagonal elements in the output s. */
if (s(j, j) == 0.) {
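The one-token fix above matters because both branches are meant to populate w_givens; the else path previously wrote the identity rotation into v_givens, clobbering it and leaving w_givens[j] stale:

  // Shape of the corrected logic: both paths write w_givens[j].
  //   if (rotation_needed) w_givens[j] = givens;
  //   else                 w_givens[j] = IdentityRotation;  // was v_givens[j]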
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index cf09749..8d6821a 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -108,6 +108,17 @@
VERIFY_IS_EQUAL(slice.data(), tensor.data());
}
+static void test_ref_of_trace() {
+ Tensor<int, 2> input(6, 6);
+ input.setRandom();
+ int trace = 0;
+ for (int i = 0; i < 6; ++i) {
+ trace += input(i, i);
+ }
+ TensorRef<const Tensor<int, 0>> ref(input.trace());
+ VERIFY_IS_EQUAL(ref.coeff(0), trace);
+}
+
static void test_ref_of_ref() {
Tensor<float, 3> input(3, 5, 7);
input.setRandom();
@@ -224,6 +235,7 @@
CALL_SUBTEST(test_simple_rvalue_ref());
CALL_SUBTEST(test_multiple_dims());
CALL_SUBTEST(test_slice());
+ CALL_SUBTEST(test_ref_of_trace());
CALL_SUBTEST(test_ref_of_ref());
CALL_SUBTEST(test_ref_in_expr());
CALL_SUBTEST(test_coeff_ref());