Update Eigen to commit:d2dce377670f1742f3f701b3585682d02f10bb92 CHANGELOG ========= d2dce3776 - Optimize slerp() as proposed by Gopinath Vasalamarri. 66d8111ac - Use a more conservative method to detect non-finite inputs to cbrt. d6689a15d - Replace instances of EIGEN_CONSTEXPR macro 33f5f5961 - Vectorize cbrt for float and double. 533096090 - Enable packet segment in partial redux 6266d430c - packet segment: also check DiagonalWrapper e39ad8bad - fix constexpr in CoreEvaluators.h 7aefb9f4d - fix memset optimization for std::complex types 73ca849a6 - fix packetSegment for ArrayWrapper / MatrixWrapper 28c3b26d5 - masked load/store framework cebe09110 - Fix a potential deadlock because of Eigen thread pool 11fd34cc1 - Fix the typing of the Tasks in ForkJoin.h 2cd47d743 - Fixe Conversion Warning in Parallelizer b86004226 - Add postream for ostream-ing packets more reliably. 02d9e1138 - Add missing pmadd for Packet16bf. 9cc9209b9 - Fix cmake warning and default to j0. e0c99a8dd - By default, run ctests on all available cores in parallel. 63a40ffb9 - Use fma<float> for fma<half> and fma<bfloat16> if native fma is not available on the platform. 44fb6422b - All triggering full CI if MR label containts all-tests 3866cbfbe - Fix test for TensorRef of trace. 6579e36eb - Allow Tensor trace to be passed to a TensorRef. 8e32cbf7d - Reduce flakiness of test for Eigen::half. d935916ac - Add numext::fma and missing pmadd implementations. 754bd24f5 - fix 2828 ac2165c11 - fix allFinite 314396819 - Generalize the Eigen ForkJoin scheduler to use any ThreadPool interface. 70f2aead9 - Use native _Float16 for AVX512FP16 and update vectorization. 0259a52b0 - Use more .noalias() 14f845a1a - Fix givens rotation. 33b04fe51 - CMake: add install-doc target 10e62ccd2 - Fix x86 complex vectorized fma PiperOrigin-RevId: 753703769 Change-Id: I43bc1cf7c598ca3f306fffea9844e9c5b1a21b79
diff --git a/Eigen/Core b/Eigen/Core index 99cd473..6ae069a 100644 --- a/Eigen/Core +++ b/Eigen/Core
@@ -193,21 +193,27 @@ #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" #if defined EIGEN_VECTORIZE_AVX512 +#include "src/Core/arch/SSE/PacketMath.h" +#include "src/Core/arch/AVX/PacketMath.h" +#include "src/Core/arch/AVX512/PacketMath.h" #if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/PacketMathFP16.h" #endif -#include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" -#include "src/Core/arch/SSE/Complex.h" -#include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/TypeCasting.h" -#include "src/Core/arch/AVX/Complex.h" -#include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/AVX512/TypeCasting.h" +#if defined EIGEN_VECTORIZE_AVX512FP16 +#include "src/Core/arch/AVX512/TypeCastingFP16.h" +#endif +#include "src/Core/arch/SSE/Complex.h" +#include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" +#if defined EIGEN_VECTORIZE_AVX512FP16 +#include "src/Core/arch/AVX512/MathFunctionsFP16.h" +#endif #include "src/Core/arch/AVX512/TrsmKernel.h" #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 5d52ab2..b1d801d 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h
@@ -230,8 +230,8 @@ */ const LDLT& adjoint() const { return *this; } - EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); } /** \brief Reports whether previous computation was successful. *
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 01b4476..7fa4fa2 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h
@@ -182,10 +182,10 @@ * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: * \code x = decomposition.adjoint().solve(b) \endcode */ - const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; } + const LLT& adjoint() const noexcept { return *this; } - inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } - inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + constexpr Index rows() const noexcept { return m_matrix.rows(); } + constexpr Index cols() const noexcept { return m_matrix.cols(); } template <typename VectorType> LLT& rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 3376dfc..57f3186 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h
@@ -129,7 +129,7 @@ #endif /** \brief Move constructor */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(Array&&) = default; - EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) { + EIGEN_DEVICE_FUNC Array& operator=(Array&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) { Base::operator=(std::move(other)); return *this; } @@ -253,8 +253,8 @@ PrivateType()) : Base(other.derived()) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); } #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h index b636d88..c9a194e 100644 --- a/Eigen/src/Core/ArrayWrapper.h +++ b/Eigen/src/Core/ArrayWrapper.h
@@ -56,14 +56,10 @@ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { - return m_expression.outerStride(); - } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { - return m_expression.innerStride(); - } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); } @@ -135,14 +131,10 @@ EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { - return m_expression.outerStride(); - } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { - return m_expression.innerStride(); - } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const 
noexcept { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 0ea1bc3..36f0a9d 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h
@@ -136,6 +136,7 @@ : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling) #endif : NoUnrolling; + static constexpr bool UsePacketSegment = has_packet_segment<PacketType>::value; #ifdef EIGEN_DEBUG_ASSIGN static void debug() { @@ -199,7 +200,7 @@ template <typename Kernel, int Stop> struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {} }; template <typename Kernel, int Index_, int Stop> @@ -253,7 +254,7 @@ template <typename Kernel, int Stop> struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {} }; template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment> @@ -273,6 +274,33 @@ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {} }; +template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment> +struct copy_using_evaluator_innervec_segment { + using PacketType = typename Kernel::PacketType; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) { + kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0, + Stop - Start); + } +}; + +template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment> +struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment, + /*UsePacketSegment*/ false> + : copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {}; + +template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment> +struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment, 
+ /*UsePacketSegment*/ true> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {} +}; + +template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment> +struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment, + /*UsePacketSegment*/ false> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {} +}; + /*************************************************************************** * Part 3 : implementation of all cases ***************************************************************************/ @@ -306,7 +334,7 @@ struct dense_assignment_loop_impl<Kernel, AllAtOnceTraversal, Unrolling> { static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime; - EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) { + EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& /*kernel*/) { EIGEN_STATIC_ASSERT(SizeAtCompileTime == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT) } }; @@ -353,28 +381,46 @@ // The goal of unaligned_dense_assignment_loop is simply to factorize the handling // of the non vectorizable beginning and ending parts -template <bool IsAligned = false> +template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip> struct unaligned_dense_assignment_loop { - // if IsAligned = true, then do nothing + // if Skip == true, then do nothing template <typename Kernel> - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*start*/, Index /*end*/) {} + template <typename Kernel> + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*outer*/, + Index /*innerStart*/, Index /*innerEnd*/) {} }; -template <> -struct unaligned_dense_assignment_loop<false> { - // MSVC must not inline 
this functions. If it does, it fails to optimize the - // packet access path. - // FIXME check which version exhibits this issue -#if EIGEN_COMP_MSVC +template <typename PacketType, int DstAlignment, int SrcAlignment> +struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true, + /*Skip*/ false> { template <typename Kernel> - static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end) -#else + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) { + Index count = end - start; + eigen_assert(count <= unpacket_traits<PacketType>::size); + if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count); + } template <typename Kernel> - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) -#endif - { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index start, Index end) { + Index count = end - start; + eigen_assert(count <= unpacket_traits<PacketType>::size); + if (count > 0) + kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count); + } +}; + +template <typename PacketType, int DstAlignment, int SrcAlignment> +struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false, + /*Skip*/ false> { + template <typename Kernel> + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) { for (Index index = start; index < end; ++index) kernel.assignCoeff(index); } + template <typename Kernel> + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index innerStart, + Index innerEnd) { + for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner); + } }; template <typename Kernel, int Index_, int 
Stop> @@ -392,7 +438,32 @@ template <typename Kernel, int Stop> struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {} +}; + +template <typename Kernel, int Index_, int Stop, bool UsePacketSegment> +struct copy_using_evaluator_linearvec_segment { + using PacketType = typename Kernel::PacketType; + static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment; + static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { + kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_); + } +}; + +template <typename Kernel, int Index_, int Stop> +struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false> + : copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {}; + +template <typename Kernel, int Stop> +struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {} +}; + +template <typename Kernel, int Stop> +struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {} }; template <typename Kernel> @@ -400,23 +471,30 @@ using Scalar = typename Kernel::Scalar; using PacketType = typename Kernel::PacketType; static constexpr int PacketSize = unpacket_traits<PacketType>::size; - static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment; - static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment; static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; - static constexpr int 
DstAlignment = - packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment; + static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar)); + static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment; + static constexpr bool Alignable = + (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0); + static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment; + static constexpr bool DstIsAligned = DstAlignment >= Alignment; + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + using head_loop = + unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>; + using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { const Index size = kernel.size(); - const Index alignedStart = DstIsAligned ? 0 : first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size); + const Index alignedStart = DstIsAligned ? 
0 : first_aligned<Alignment>(kernel.dstDataPtr(), size); const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize); - unaligned_dense_assignment_loop<DstIsAligned>::run(kernel, 0, alignedStart); + head_loop::run(kernel, 0, alignedStart); for (Index index = alignedStart; index < alignedEnd; index += PacketSize) - kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(index); + kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index); - unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); + tail_loop::run(kernel, alignedEnd, size); } }; @@ -426,10 +504,11 @@ static constexpr int PacketSize = unpacket_traits<PacketType>::size; static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime; static constexpr int AlignedSize = numext::round_down(Size, PacketSize); + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel); - copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, AlignedSize, Size>::run(kernel); + copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel); } }; @@ -444,7 +523,7 @@ static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); for (Index outer = 0; outer < outerSize; ++outer) @@ -482,7 +561,7 @@ template <typename Kernel> struct dense_assignment_loop_impl<Kernel, LinearTraversal, 
NoUnrolling> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { const Index size = kernel.size(); for (Index i = 0; i < size; ++i) kernel.assignCoeff(i); } @@ -490,7 +569,7 @@ template <typename Kernel> struct dense_assignment_loop_impl<Kernel, LinearTraversal, CompleteUnrolling> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run( kernel); } @@ -505,35 +584,35 @@ using Scalar = typename Kernel::Scalar; using PacketType = typename Kernel::PacketType; static constexpr int PacketSize = unpacket_traits<PacketType>::size; - static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment; + static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; + static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar)); + static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment; static constexpr bool Alignable = - packet_traits<Scalar>::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar); - static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment; - static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment; + (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0); + static constexpr int Alignment = Alignable ? 
RequestedAlignment : DstAlignment; + static constexpr bool DstIsAligned = DstAlignment >= Alignment; + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>; + using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { const Scalar* dst_ptr = kernel.dstDataPtr(); - if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) { - // the pointer is not aligned-on scalar, so alignment is not possible - return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel); - } const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0; - Index alignedStart = - ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<RequestedAlignment>(dst_ptr, innerSize); + Index alignedStart = ((!Alignable) || DstIsAligned) ? 
0 : internal::first_aligned<Alignment>(dst_ptr, innerSize); for (Index outer = 0; outer < outerSize; ++outer) { const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize); - // do the non-vectorizable part of the assignment - for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner); + + head_loop::run(kernel, outer, 0, alignedStart); // do the vectorizable part of the assignment for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize) - kernel.template assignPacketByOuterInner<DstAlignment, Unaligned, PacketType>(outer, inner); + kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner); - // do the non-vectorizable part of the assignment - for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner); + tail_loop::run(kernel, outer, alignedEnd, innerSize); alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize); } @@ -547,11 +626,16 @@ static constexpr int PacketSize = unpacket_traits<PacketType>::size; static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime; static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize); + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>; + using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned, + Unaligned, UsePacketSegment>; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) { for (Index outer = 0; outer < kernel.outerSize(); ++outer) { - copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, 0, 0>::run(kernel, outer); - 
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizableSize, InnerSize>::run(kernel, outer); + packet_loop::run(kernel, outer); + packet_segment_loop::run(kernel, outer); } } }; @@ -590,15 +674,15 @@ #endif } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_dstExpr.size(); } + EIGEN_DEVICE_FUNC constexpr Index innerSize() const noexcept { return m_dstExpr.innerSize(); } + EIGEN_DEVICE_FUNC constexpr Index outerSize() const noexcept { return m_dstExpr.outerSize(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dstExpr.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_dstExpr.cols(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_dstExpr.outerStride(); } - EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; } - EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; } + EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() noexcept { return m_dst; } + EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; } /// Assign src(row,col) to dst(row,col) through the assignment functor. 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) { @@ -635,6 +719,27 @@ assignPacket<StoreMode, LoadMode, Packet>(row, col); } + template <int StoreMode, int LoadMode, typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) { + m_functor.template assignPacketSegment<StoreMode>( + &m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin, + count); + } + + template <int StoreMode, int LoadMode, typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) { + m_functor.template assignPacketSegment<StoreMode>( + &m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count); + } + + template <int StoreMode, int LoadMode, typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, + Index count) { + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); + assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::RowsAtCompileTime) == 1 ? 
0 @@ -704,9 +809,8 @@ } template <typename DstXprType, typename SrcXprType, typename Functor> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst, - const SrcXprType& src, - const Functor& func) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, + const Functor& func) { typedef evaluator<DstXprType> DstEvaluatorType; typedef evaluator<SrcXprType> SrcEvaluatorType; @@ -775,7 +879,7 @@ // Deal with "assume-aliasing" template <typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment( +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment( Dst& dst, const Src& src, const Func& func, std::enable_if_t<evaluator_assume_aliasing<Src>::value, void*> = 0) { typename plain_matrix_type<Src>::type tmp(src); call_assignment_no_alias(dst, tmp, func); @@ -790,14 +894,14 @@ // by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template <typename Dst, template <typename> class StorageBase, typename Src, typename Func> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(NoAlias<Dst, StorageBase>& dst, - const Src& src, const Func& func) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(NoAlias<Dst, StorageBase>& dst, const Src& src, + const Func& func) { call_assignment_no_alias(dst.expression(), src, func); } template <typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src, - const Func& func) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src, + const Func& func) { enum { NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) && 
@@ -836,14 +940,13 @@ } template <typename Dst, typename Src> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src) { call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>()); } template <typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst, - const Src& src, - const Func& func) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, + const Func& func) { // TODO check whether this is the right place to perform these checks: EIGEN_STATIC_ASSERT_LVALUE(Dst) EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src) @@ -852,8 +955,7 @@ Assignment<Dst, Src, Func>::run(dst, src, func); } template <typename Dst, typename Src> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst, - const Src& src) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) { call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>()); }
diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index ca991ca..57b0322 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h
@@ -200,16 +200,16 @@ : m_coeffs(1 + supers + subs, cols), m_rows(rows), m_supers(supers), m_subs(subs) {} /** \returns the number of columns */ - inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); } + constexpr Index rows() const { return m_rows.value(); } /** \returns the number of rows */ - inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); } + constexpr Index cols() const { return m_coeffs.cols(); } /** \returns the number of super diagonals */ - inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); } + constexpr Index supers() const { return m_supers.value(); } /** \returns the number of sub diagonals */ - inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); } + constexpr Index subs() const { return m_subs.value(); } inline const CoefficientsType& coeffs() const { return m_coeffs; } inline CoefficientsType& coeffs() { return m_coeffs; } @@ -260,16 +260,16 @@ } /** \returns the number of columns */ - inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); } + constexpr Index rows() const { return m_rows.value(); } /** \returns the number of rows */ - inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); } + constexpr Index cols() const { return m_coeffs.cols(); } /** \returns the number of super diagonals */ - inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); } + constexpr Index supers() const { return m_supers.value(); } /** \returns the number of sub diagonals */ - inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); } + constexpr Index subs() const { return m_subs.value(); } inline const CoefficientsType& coeffs() const { return m_coeffs; }
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 709264c..39abff7 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h
@@ -289,13 +289,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT { - return m_startRow.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT { - return m_startCol.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); } protected: XprTypeNested m_xpr; @@ -319,8 +315,7 @@ * Adding an offset to nullptr is undefined behavior, so we must avoid it. */ template <typename Scalar> - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base, - Index offset) { + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) { return base != nullptr ? base + offset : nullptr; } @@ -378,30 +373,25 @@ init(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const - EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const noexcept { return m_xpr; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index innerStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index innerStride() const noexcept { return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? 
m_xpr.innerStride() : m_xpr.outerStride(); } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT { - return m_startRow.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT { - return m_startCol.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); } #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend...
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index c629123..c414117 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h
@@ -92,7 +92,7 @@ EIGEN_DEVICE_FUNC inline ~CommaInitializer() #if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS - EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception) + noexcept(false) // Eigen::eigen_assert_exception #endif { finished();
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 156ca2b..e3af2d2 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h
@@ -149,7 +149,7 @@ #endif eigen_internal_assert(outerStride == OuterStride); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return OuterStride; } const Scalar* data; }; @@ -198,19 +198,13 @@ } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const { - if (IsRowMajor) - return m_d.data[row * m_d.outerStride() + col]; - else - return m_d.data[row + col * m_d.outerStride()]; + return coeff(getIndex(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) { - if (IsRowMajor) - return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col]; - else - return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()]; + return coeffRef(getIndex(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { @@ -219,10 +213,7 @@ template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - if (IsRowMajor) - return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col); - else - return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride()); + return packet<LoadMode, PacketType>(getIndex(row, col)); } template <int LoadMode, typename PacketType> @@ -232,19 +223,43 @@ template <int StoreMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { - if (IsRowMajor) - return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x); - else - return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x); + 
writePacket<StoreMode, PacketType>(getIndex(row, col), x); } template <int StoreMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x); + pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count); } protected: plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d; + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index constexpr getIndex(Index row, Index col) const { + return IsRowMajor ? 
row * m_d.outerStride() + col : row + col * m_d.outerStride(); + } }; template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols> @@ -318,6 +333,28 @@ m_argImpl.template writePacket<StoreMode, PacketType>(index, x); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count); + } + protected: evaluator<ArgType> m_argImpl; }; @@ -464,10 +501,10 @@ struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>> : evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> { typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType; - typedef internal::remove_all_t<PlainObjectType> PlainObjectTypeCleaned; + typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned; enum { - CoeffReadCost = internal::functor_traits<NullaryOp>::Cost, + CoeffReadCost = functor_traits<NullaryOp>::Cost, Flags = (evaluator<PlainObjectTypeCleaned>::Flags & (HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? 
LinearAccessBit : 0) | @@ -502,9 +539,21 @@ return m_wrapper.template packetOp<PacketType>(m_functor, index); } + template <int LoadMode, typename PacketType, typename IndexType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/, + Index /*count*/) const { + return packet<LoadMode, PacketType, IndexType>(row, col); + } + + template <int LoadMode, typename PacketType, typename IndexType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/, + Index /*count*/) const { + return packet<LoadMode, PacketType, IndexType>(index); + } + protected: const NullaryOp m_functor; - const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper; + const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper; }; // -------------------- CwiseUnaryOp -------------------- @@ -546,6 +595,16 @@ return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index)); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count)); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -600,16 +659,11 @@ template <typename DstPacketType> using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>; - template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) 
const { - return col + packetSize <= cols(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const { + return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows()); } - template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const { - return row + packetSize <= rows(); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const { - return index + packetSize <= size(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const { + return index + count + begin <= size(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const { @@ -632,43 +686,86 @@ template <int LoadMode, typename PacketType = SrcPacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const { constexpr int PacketSize = unpacket_traits<PacketType>::size; - Index actualRow = IsRowMajor ? row : row + (offset * PacketSize); - Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col; - eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds"); + Index packetOffset = offset * PacketSize; + Index actualRow = IsRowMajor ? row : row + packetOffset; + Index actualCol = IsRowMajor ? 
col + packetOffset : col; + eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds"); return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol); } template <int LoadMode, typename PacketType = SrcPacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const { constexpr int PacketSize = unpacket_traits<PacketType>::size; - Index actualIndex = index + (offset * PacketSize); - eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds"); + Index packetOffset = offset * PacketSize; + Index actualIndex = index + packetOffset; + eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds"); return m_argImpl.template packet<LoadMode, PacketType>(actualIndex); } + template <int LoadMode, typename PacketType = SrcPacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count, + Index offset) const { + constexpr int PacketSize = unpacket_traits<PacketType>::size; + Index packetOffset = offset * PacketSize; + Index actualRow = IsRowMajor ? row : row + packetOffset; + Index actualCol = IsRowMajor ? 
col + packetOffset : col; + eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds"); + return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count); + } + template <int LoadMode, typename PacketType = SrcPacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count, + Index offset) const { + constexpr int PacketSize = unpacket_traits<PacketType>::size; + Index packetOffset = offset * PacketSize; + Index actualIndex = index + packetOffset + begin; + eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds"); + return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count); + } + + template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col, + Index begin, + Index count) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<PacketType, NumPackets> packets; + Index offset = begin / SrcPacketSize; + Index actualBegin = begin % SrcPacketSize; + for (; offset < NumPackets; offset++) { + Index actualCount = numext::mini(SrcPacketSize - actualBegin, count); + packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset); + if (count == actualCount) break; + actualBegin = 0; + count -= actualCount; + } + return packets; + } + template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index, + Index begin, + Index count) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<PacketType, NumPackets> packets; + Index offset = begin / SrcPacketSize; + Index actualBegin = begin % SrcPacketSize; + for (; offset < NumPackets; 
offset++) { + Index actualCount = numext::mini(SrcPacketSize - actualBegin, count); + packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset); + if (count == actualCount) break; + actualBegin = 0; + count -= actualCount; + } + return packets; + } // There is no source packet type with equal or fewer elements than DstPacketType. // This is problematic as the evaluation loop may attempt to access data outside the bounds of the array. // For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}. // The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which // is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array. - - // Instead, perform runtime check to determine if the load would access data outside the bounds of the array. - // If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load. - // In either case, perform a vectorized cast of the source packet. 
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const { constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size; constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); - SrcPacketType src; - if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) { - src = srcPacket<SrcLoadMode>(row, col, 0); - } else { - Array<SrcType, SrcPacketSize, 1> srcArray; - for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k); - for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0); - src = pload<SrcPacketType>(srcArray.data()); - } - return pcast<SrcPacketType, DstPacketType>(src); + return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0)); } // Use the source packet type with the same size as DstPacketType, if it exists template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true> @@ -704,22 +801,67 @@ srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7)); } + // packetSegment variants + template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0)); + } + // Use the source packet type with the same size as DstPacketType, if it exists + template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size; + using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast<SizedSrcPacketType, DstPacketType>( + srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0)); + } + // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize + template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int NumPackets = 2; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<SrcPacketType, NumPackets> packets = + srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count); + return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]); + } + // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize + template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int NumPackets = 4; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<SrcPacketType, NumPackets> packets = + srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count); + return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3]); + } + // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize + template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, 
Index begin, + Index count) const { + constexpr int NumPackets = 8; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<SrcPacketType, NumPackets> packets = + srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count); + return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3], packets.packet[4], packets.packet[5], + packets.packet[6], packets.packet[7]); + } + // Analogous routines for linear access. template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const { constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size; constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); - SrcPacketType src; - if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) { - src = srcPacket<SrcLoadMode>(index, 0); - } else { - Array<SrcType, SrcPacketSize, 1> srcArray; - for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k); - for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0); - src = pload<SrcPacketType>(srcArray.data()); - } - return pcast<SrcPacketType, DstPacketType>(src); + return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0)); } template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const { @@ -749,6 +891,55 @@ srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7)); } + // packetSegment variants + template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int DstPacketSize = 
unpacket_traits<DstPacketType>::size; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0)); + } + // Use the source packet type with the same size as DstPacketType, if it exists + template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size; + using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast<SizedSrcPacketType, DstPacketType>( + srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0)); + } + // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize + template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int NumPackets = 2; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<SrcPacketType, NumPackets> packets = + srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count); + return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]); + } + // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize + template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int NumPackets = 4; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<SrcPacketType, NumPackets> 
packets = + srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count); + return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3]); + } + // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize + template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int NumPackets = 8; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock<SrcPacketType, NumPackets> packets = + srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count); + return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3], packets.packet[4], packets.packet[5], + packets.packet[6], packets.packet[7]); + } + constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; } @@ -826,6 +1017,20 @@ m_d.arg3Impl.template packet<LoadMode, PacketType>(index)); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count), + m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count), + m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count)); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count), + 
m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count), + m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -922,6 +1127,18 @@ m_d.rhsImpl.template packet<LoadMode, PacketType>(index)); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count), + m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count)); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count), + m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -1013,7 +1230,7 @@ m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) { EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0, - internal::inner_stride_at_compile_time<Derived>::ret == 1), + inner_stride_at_compile_time<Derived>::ret == 1), PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1035,36 +1252,60 @@ template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { PointerType ptr = m_data + row * rowStride() + col * colStride(); - return internal::ploadt<PacketType, LoadMode>(ptr); + return ploadt<PacketType, LoadMode>(ptr); } template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType 
packet(Index index) const { - return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value()); + return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value()); } template <int StoreMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { PointerType ptr = m_data + row * rowStride() + col * colStride(); - return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x); + pstoret<Scalar, PacketType, StoreMode>(ptr, x); } template <int StoreMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x); + pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + return ploadtSegment<PacketType, LoadMode>(ptr, begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * 
m_innerStride.value(), x, begin, count); } protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowStride() const noexcept { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colStride() const noexcept { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } PointerType m_data; - const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride; - const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride; + const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride; + const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride; }; template <typename PlainObjectType, int MapOptions, typename StrideType> @@ -1117,7 +1358,7 @@ // -------------------- Block -------------------- template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel, - bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> + bool HasDirectAccess = has_direct_access<ArgType>::ret> struct block_evaluator; template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> @@ -1246,6 +1487,39 @@ x); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col, + begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + if (ForwardLinearAccess) + return m_argImpl.template packetSegment<LoadMode, 
PacketType>(m_linear_offset.value() + index, begin, count); + else + return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0, + begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row, + m_startCol.value() + col, x, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + if (ForwardLinearAccess) + return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin, + count); + else + return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, x, begin, count); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const { @@ -1341,8 +1615,8 @@ typedef Replicate<ArgType, RowFactor, ColFactor> XprType; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? 
Dynamic : RowFactor * ColFactor }; - typedef typename internal::nested_eval<ArgType, Factor>::type ArgTypeNested; - typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned; + typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested; + typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned; enum { CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost, @@ -1361,19 +1635,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { // try to avoid using modulo; this is a pure optimization strategy - const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0 - : RowFactor == 1 ? row - : row % m_rows.value(); - const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0 - : ColFactor == 1 ? col - : col % m_cols.value(); + const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value(); + const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value(); return m_argImpl.coeff(actual_row, actual_col); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { // try to avoid using modulo; this is a pure optimization strategy - const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1 + const Index actual_index = traits<XprType>::RowsAtCompileTime == 1 ? (ColFactor == 1 ? index : index % m_cols.value()) : (RowFactor == 1 ? index : index % m_rows.value()); @@ -1382,25 +1652,38 @@ template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0 - : RowFactor == 1 ? row - : row % m_rows.value(); - const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0 - : ColFactor == 1 ? 
col - : col % m_cols.value(); + const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value(); + const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value(); return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col); } template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const { - const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1 + const Index actual_index = traits<XprType>::RowsAtCompileTime == 1 ? (ColFactor == 1 ? index : index % m_cols.value()) : (RowFactor == 1 ? index : index % m_rows.value()); return m_argImpl.template packet<LoadMode, PacketType>(actual_index); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value(); + const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value(); + + return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + const Index actual_index = traits<XprType>::RowsAtCompileTime == 1 + ? (ColFactor == 1 ? index : index % m_cols.value()) + : (RowFactor == 1 ? 
index : index % m_rows.value()); + + return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count); + } + protected: const ArgTypeNested m_arg; evaluator<ArgTypeNestedCleaned> m_argImpl; @@ -1457,6 +1740,28 @@ m_argImpl.template writePacket<StoreMode>(index, x); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count); + } + + template <int StoreMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count); + } + protected: evaluator<ArgType> m_argImpl; }; @@ -1536,41 +1841,97 @@ template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - enum { - PacketSize = unpacket_traits<PacketType>::size, - OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1, - OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1 - }; - typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet; - return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>( - ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? 
m_cols.value() - col - OffsetCol : col)); + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + + return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol)); } template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const { - enum { PacketSize = unpacket_traits<PacketType>::size }; - return preverse( - m_argImpl.template packet<LoadMode, PacketType>(m_rows.value() * m_cols.value() - index - PacketSize)); + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + + return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex)); } template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { - // FIXME we could factorize some code with packet(i,j) - enum { - PacketSize = unpacket_traits<PacketType>::size, - OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1, - OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1 - }; - typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet; - m_argImpl.template writePacket<LoadMode>(ReverseRow ? m_rows.value() - row - OffsetRow : row, - ReverseCol ? m_cols.value() - col - OffsetCol : col, - reverse_packet::run(x)); + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? 
PacketSize : 1; + using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + + m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x)); } template <int LoadMode, typename PacketType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - enum { PacketSize = unpacket_traits<PacketType>::size }; - m_argImpl.template writePacket<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + + m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x)); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + Index actualBegin = ReversePacket ? 
(PacketSize - count - begin) : begin; + + return reverse_packet::run( + m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count)); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + Index actualBegin = PacketSize - count - begin; + + return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count)); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + Index actualBegin = ReversePacket ? 
(PacketSize - count - begin) : begin; + + m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + static constexpr int PacketSize = unpacket_traits<PacketType>::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + Index actualBegin = PacketSize - count - begin; + + m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count); } protected: @@ -1621,13 +1982,13 @@ protected: evaluator<ArgType> m_argImpl; - const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index; + const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index; private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; } }; @@ -1656,9 +2017,9 @@ const ArgType& arg() const { return m_arg; } - EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); } + constexpr Index rows() const noexcept { return m_arg.rows(); } - EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); } + constexpr Index cols() const noexcept { return m_arg.cols(); } private: const ArgType& m_arg;
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index aa79b60..e2b2da5 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -108,12 +108,12 @@ eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { // return the fixed size type if available to enable compile time optimizations return internal::traits<internal::remove_all_t<LhsNested>>::RowsAtCompileTime == Dynamic ? m_rhs.rows() : m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { // return the fixed size type if available to enable compile time optimizations return internal::traits<internal::remove_all_t<LhsNested>>::ColsAtCompileTime == Dynamic ? m_rhs.cols() : m_lhs.cols();
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 9c305c6..13a542a 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -76,8 +76,8 @@ EIGEN_STATIC_ASSERT(CwiseNullaryOp::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const { return m_cols.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols.value(); } /** \returns the functor representing the nullary operation */ EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 42ed459..94ec1a0 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -60,8 +60,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_xpr.cols(); } /** \returns the functor representing the unary operation */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 49b1410..7dd7623 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -66,13 +66,13 @@ EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeffRef(0)); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { + EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? int(StrideType::InnerStrideAtCompileTime) : derived().nestedExpression().innerStride() * sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { + EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? int(StrideType::OuterStrideAtCompileTime) : derived().nestedExpression().outerStride() * sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar); @@ -145,8 +145,8 @@ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.cols(); } /** \returns the functor representing unary operation */ EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index d5906bd..4f68942 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h
@@ -208,7 +208,7 @@ * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a * column-major matrix, and the number of rows for a row-major matrix. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const { + EIGEN_DEVICE_FUNC constexpr Index outerSize() const { return IsVectorAtCompileTime ? 1 : int(IsRowMajor) ? this->rows() : this->cols(); } @@ -217,7 +217,7 @@ * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a * column-major matrix, and the number of columns for a row-major matrix. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const { + EIGEN_DEVICE_FUNC constexpr Index innerSize() const { return IsVectorAtCompileTime ? this->size() : int(IsRowMajor) ? this->cols() : this->rows(); }
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 97f9b50..cff104c 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -89,13 +89,12 @@ * * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeff(Index row, Index col) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const { eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); return internal::evaluator<Derived>(derived()).coeff(row, col); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeffByOuterInner(Index outer, - Index inner) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const { return coeff(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner)); } @@ -103,7 +102,7 @@ * * \sa operator()(Index,Index), operator[](Index) */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator()(Index row, Index col) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index row, Index col) const { eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); return coeff(row, col); } @@ -123,7 +122,7 @@ * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit, THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS) eigen_internal_assert(index >= 0 && index < size()); @@ -138,7 +137,7 @@ * z() const, w() const */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator[](Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator[](Index index) const { EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime, 
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD) eigen_assert(index >= 0 && index < size()); @@ -155,32 +154,32 @@ * z() const, w() const */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType operator()(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index index) const { eigen_assert(index >= 0 && index < size()); return coeff(index); } /** equivalent to operator[](0). */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType x() const { return (*this)[0]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType x() const { return (*this)[0]; } /** equivalent to operator[](1). */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType y() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType y() const { EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS); return (*this)[1]; } /** equivalent to operator[](2). */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType z() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType z() const { EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS); return (*this)[2]; } /** equivalent to operator[](3). 
*/ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR CoeffReturnType w() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType w() const { EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS); return (*this)[3]; } @@ -362,32 +361,32 @@ * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& operator()(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index index) { eigen_assert(index >= 0 && index < size()); return coeffRef(index); } /** equivalent to operator[](0). */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& x() { return (*this)[0]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& x() { return (*this)[0]; } /** equivalent to operator[](1). */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& y() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& y() { EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS); return (*this)[1]; } /** equivalent to operator[](2). */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& z() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& z() { EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS); return (*this)[2]; } /** equivalent to operator[](3). 
*/ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Scalar& w() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& w() { EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS); return (*this)[3]; } @@ -421,33 +420,29 @@ * * \sa outerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().innerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return derived().innerStride(); } /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns * in a column-major matrix). * * \sa innerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return derived().outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return derived().outerStride(); } // FIXME shall we remove it ? - EIGEN_CONSTEXPR inline Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); } + constexpr Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); } /** \returns the pointer increment between two consecutive rows. * * \sa innerStride(), outerStride(), colStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const { - return Derived::IsRowMajor ? outerStride() : innerStride(); - } + EIGEN_DEVICE_FUNC constexpr Index rowStride() const { return Derived::IsRowMajor ? outerStride() : innerStride(); } /** \returns the pointer increment between two consecutive columns. * * \sa innerStride(), outerStride(), rowStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const { - return Derived::IsRowMajor ? innerStride() : outerStride(); - } + EIGEN_DEVICE_FUNC constexpr Index colStride() const { return Derived::IsRowMajor ? 
innerStride() : outerStride(); } }; /** \brief Base class providing direct read/write coefficient access to matrices and arrays. @@ -478,25 +473,23 @@ * * \sa outerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); } /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns * in a column-major matrix). * * \sa innerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); } // FIXME shall we remove it ? - EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT { - return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); - } + constexpr Index stride() const noexcept { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); } /** \returns the pointer increment between two consecutive rows. * * \sa innerStride(), outerStride(), colStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index rowStride() const noexcept { return Derived::IsRowMajor ? outerStride() : innerStride(); } @@ -504,7 +497,7 @@ * * \sa innerStride(), outerStride(), rowStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index colStride() const noexcept { return Derived::IsRowMajor ? 
innerStride() : outerStride(); } }; @@ -513,7 +506,7 @@ template <int Alignment, typename Derived, bool JustReturnZero> struct first_aligned_impl { - static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; } + static constexpr Index run(const Derived&) noexcept { return 0; } }; template <int Alignment, typename Derived>
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h index 75fc8e7..012dce1 100644 --- a/Eigen/src/Core/DeviceWrapper.h +++ b/Eigen/src/Core/DeviceWrapper.h
@@ -87,13 +87,13 @@ int Unrolling = Kernel::AssignmentTraits::Unrolling> struct dense_assignment_loop_with_device { using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Device&) { Base::run(kernel); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); } }; // entry point for a generic expression with device template <typename Dst, typename Src, typename Func, typename Device> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst, - const Src& src, const Func& func) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst, + const Src& src, const Func& func) { enum { NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) && @@ -115,10 +115,8 @@ // copy and pasted from AssignEvaluator except forward device to kernel template <typename DstXprType, typename SrcXprType, typename Functor, typename Device> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst, - const SrcXprType& src, - const Functor& func, - Device& device) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, + const Functor& func, Device& device) { using DstEvaluatorType = evaluator<DstXprType>; using SrcEvaluatorType = evaluator<SrcXprType>;
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 8d27857..ff8611c 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h
@@ -83,13 +83,11 @@ : numext::mini<Index>(m_matrix.rows(), m_matrix.cols() - m_index.value()); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return 1; } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return 1; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { - return m_matrix.outerStride() + 1; - } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.outerStride() + 1; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return 0; } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 0; } typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue; @@ -134,13 +132,13 @@ private: // some compilers may fail to optimize std::max etc in case of compile-time constants... - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index absDiagIndex() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index absDiagIndex() const noexcept { return m_index.value() > 0 ? m_index.value() : -m_index.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const noexcept { return m_index.value() > 0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const noexcept { return m_index.value() > 0 ? m_index.value() : 0; } // trigger a compile-time error if someone try to call packet
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index 4115b64..52630d9 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -76,9 +76,9 @@ } /** \returns the number of rows. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return diagonal().size(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const { return diagonal().size(); } /** \returns the number of columns. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return diagonal().size(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const { return diagonal().size(); } /** \returns the diagonal matrix product of \c *this by the dense matrix, \a matrix */ template <typename MatrixDerived>
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index 894bfc1..c9a6e88 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h
@@ -56,12 +56,12 @@ EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); } /** \returns the number of rows. \sa cols(), RowsAtCompileTime */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); } /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); } /** \returns the number of coefficients, which is rows()*cols(). * \sa rows(), cols(), SizeAtCompileTime. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); } + EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return rows() * cols(); } /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */ template <typename Dest>
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h index 3b0af91..9d4ecd4 100644 --- a/Eigen/src/Core/Fill.h +++ b/Eigen/src/Core/Fill.h
@@ -92,7 +92,8 @@ template <typename Xpr> struct eigen_memset_helper { - static constexpr bool value = std::is_trivial<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value; + static constexpr bool value = + std::is_trivially_copyable<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value; }; template <typename Xpr>
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h index a91b0da..55beab3 100644 --- a/Eigen/src/Core/ForceAlignedAccess.h +++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,14 +41,10 @@ EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { - return m_expression.outerStride(); - } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { - return m_expression.innerStride(); - } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const { return m_expression.coeff(row, col);
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 26a4634..d45cb4b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h
@@ -72,6 +72,7 @@ HasReciprocal = 0, HasSqrt = 0, HasRsqrt = 0, + HasCbrt = 0, HasExp = 0, HasExpm1 = 0, HasLog = 0, @@ -368,6 +369,11 @@ EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) { return a / b; } +// Avoid compiler warning for boolean algebra. +template <> +EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) { + return a && b; +} // In the generic case, memset to all one bits. template <typename Packet, typename EnableIf = void> @@ -449,48 +455,42 @@ template <typename T> struct bit_and { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; } }; template <typename T> struct bit_or { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; } }; template <typename T> struct bit_xor { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; } }; template <typename T> struct bit_not { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; } }; template <> struct bit_and<bool> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { - return a && b; - } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a && b; } }; template <> struct bit_or<bool> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { - return a || b; 
- } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a || b; } }; template <> struct bit_xor<bool> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { - return a != b; - } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a != b; } }; template <> struct bit_not<bool> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; } + EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; } }; // Use operators &, |, ^, ~. @@ -580,7 +580,7 @@ } // In the general case, use bitwise select. -template <typename Packet, typename EnableIf = void> +template <typename Packet, bool is_scalar = is_scalar<Packet>::value> struct pselect_impl { static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { return por(pand(a, mask), pandnot(b, mask)); @@ -589,9 +589,9 @@ // For scalars, use ternary select. template <typename Packet> -struct pselect_impl<Packet, std::enable_if_t<is_scalar<Packet>::value>> { +struct pselect_impl<Packet, true> { static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { - return numext::equal_strict(mask, Packet(0)) ? 
b : a; + return numext::select(mask, a, b); } }; @@ -1294,29 +1294,61 @@ * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ +template <typename Packet, typename EnableIf = void> +struct pmadd_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmadd(const Packet& a, const Packet& b, const Packet& c) { + return padd(pmul(a, b), c); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmsub(const Packet& a, const Packet& b, const Packet& c) { + return psub(pmul(a, b), c); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) { + return psub(c, pmul(a, b)); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) { + return pnegate(pmadd(a, b, c)); + } +}; + +template <typename Scalar> +struct pmadd_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value && NumTraits<Scalar>::IsSigned>> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmadd(const Scalar& a, const Scalar& b, const Scalar& c) { + return numext::fma(a, b, c); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmsub(const Scalar& a, const Scalar& b, const Scalar& c) { + return numext::fma(a, b, Scalar(-c)); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmadd(const Scalar& a, const Scalar& b, const Scalar& c) { + return numext::fma(Scalar(-a), b, c); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmsub(const Scalar& a, const Scalar& b, const Scalar& c) { + return -Scalar(numext::fma(a, b, c)); + } +}; + // FMA instructions. 
/** \internal \returns a * b + c (coeff-wise) */ template <typename Packet> EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, const Packet& c) { - return padd(pmul(a, b), c); + return pmadd_impl<Packet>::pmadd(a, b, c); } /** \internal \returns a * b - c (coeff-wise) */ template <typename Packet> EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, const Packet& c) { - return psub(pmul(a, b), c); + return pmadd_impl<Packet>::pmsub(a, b, c); } /** \internal \returns -(a * b) + c (coeff-wise) */ template <typename Packet> EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) { - return psub(c, pmul(a, b)); + return pmadd_impl<Packet>::pnmadd(a, b, c); } /** \internal \returns -((a * b + c) (coeff-wise) */ template <typename Packet> EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) { - return pnegate(pmadd(a, b, c)); + return pmadd_impl<Packet>::pnmsub(a, b, c); } /** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned @@ -1525,6 +1557,104 @@ return (Packet)pand(result, peven_mask(result)); // atan2 0 atan2 0 ... } +/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements + * outside this range are not defined. 
\a *from does not need to be aligned, and can be null if \a count is zero.*/ +template <typename Packet> +EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin, + Index count) { + using Scalar = typename unpacket_traits<Packet>::type; + constexpr Index PacketSize = unpacket_traits<Packet>::size; + eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range"); + Scalar aux[PacketSize]; + memset(static_cast<void*>(aux), 0x00, sizeof(Scalar) * PacketSize); + smart_copy(from + begin, from + begin + count, aux + begin); + return ploadu<Packet>(aux); +} + +/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements + * outside this range are not defined. \a *from must be aligned, and cannot be null.*/ +template <typename Packet> +EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin, + Index count) { + return ploaduSegment<Packet>(from, begin, count); +} + +/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to. +Elements outside of the range [begin, begin + count) are not defined. \a *to does not need to be aligned, and can be +null if \a count is zero.*/ +template <typename Scalar, typename Packet> +EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) { + constexpr Index PacketSize = unpacket_traits<Packet>::size; + eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range"); + Scalar aux[PacketSize]; + pstoreu<Scalar, Packet>(aux, from); + smart_copy(aux + begin, aux + begin + count, to + begin); +} + +/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to. +Elements outside of the range [begin, begin + count) are not defined. 
\a *to must be aligned, and cannot be +null.*/ +template <typename Scalar, typename Packet> +EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) { + return pstoreuSegment(to, from, begin, count); +} + +/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements + * outside this range are not defined.*/ +template <typename Packet, int Alignment> +EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin, + Index count) { + constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment; + if (Alignment >= RequiredAlignment) { + return ploadSegment<Packet>(from, begin, count); + } else { + return ploaduSegment<Packet>(from, begin, count); + } +} + +/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to. +Elements outside of the range [begin, begin + count) are not defined.*/ +template <typename Scalar, typename Packet, int Alignment> +EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) { + constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment; + if (Alignment >= RequiredAlignment) { + pstoreSegment<Scalar, Packet>(to, from, begin, count); + } else { + pstoreuSegment<Scalar, Packet>(to, from, begin, count); + } +} + +#ifndef EIGEN_NO_IO + +template <typename Packet> +class StreamablePacket { + public: + using Scalar = typename unpacket_traits<Packet>::type; + StreamablePacket(const Packet& packet) { pstoreu(v_, packet); } + + friend std::ostream& operator<<(std::ostream& os, const StreamablePacket& packet) { + os << "{" << packet.v_[0]; + for (int i = 1; i < unpacket_traits<Packet>::size; ++i) { + os << "," << packet.v_[i]; + } + os << "}"; + return os; + } + + private: + Scalar v_[unpacket_traits<Packet>::size]; +}; + +/** + * \internal \returns an intermediary that can be used to ostream packets, e.g. for debugging. 
+ */ +template <typename Packet> +StreamablePacket<Packet> postream(const Packet& packet) { + return StreamablePacket<Packet>(packet); +} + +#endif // EIGEN_NO_IO + } // end namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 454e560..358239c 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h
@@ -225,14 +225,14 @@ return this->nestedExpression().data() + row_offset + col_offset; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { if (traits<Derived>::InnerStrideAtCompileTime != Dynamic) { return traits<Derived>::InnerStrideAtCompileTime; } return innerIncrement() * this->nestedExpression().innerStride(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { if (traits<Derived>::OuterStrideAtCompileTime != Dynamic) { return traits<Derived>::OuterStrideAtCompileTime; }
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h index 013ad0a..79fc3ab 100644 --- a/Eigen/src/Core/Inverse.h +++ b/Eigen/src/Core/Inverse.h
@@ -51,8 +51,8 @@ explicit EIGEN_DEVICE_FUNC Inverse(const XprType& xpr) : m_xpr(xpr) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.rows(); } EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index df7b7ca..c740da7 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h
@@ -102,11 +102,11 @@ typedef PointerType PointerArgType; EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { + EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { + EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 1e83fdf..5e3d746 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h
@@ -84,9 +84,9 @@ typedef typename Base::CoeffReturnType CoeffReturnType; /** \copydoc DenseBase::rows() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_rows.value(); } /** \copydoc DenseBase::cols() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_cols.value(); } /** Returns a pointer to the first coefficient of the matrix or vector. *
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 528aed2..941961d 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h
@@ -170,8 +170,8 @@ template <typename Scalar> struct imag_ref_default_impl<Scalar, false> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); } + EIGEN_DEVICE_FUNC constexpr static Scalar run(Scalar&) { return Scalar(0); } + EIGEN_DEVICE_FUNC constexpr static const Scalar run(const Scalar&) { return Scalar(0); } }; template <typename Scalar> @@ -182,6 +182,10 @@ typedef typename NumTraits<Scalar>::Real& type; }; +// implementation in MathFunctionsImpl.h +template <typename Mask, bool is_built_in_float = std::is_floating_point<Mask>::value> +struct scalar_select_mask; + } // namespace internal namespace numext { @@ -207,6 +211,11 @@ return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x); } +template <typename Scalar, typename Mask> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Mask& mask, const Scalar& a, const Scalar& b) { + return internal::scalar_select_mask<Mask>::run(mask) ? b : a; +} + } // namespace numext namespace internal { @@ -827,8 +836,8 @@ template <typename T> EIGEN_DEVICE_FUNC - std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool> - isnan_impl(const T&) { +std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool> +isnan_impl(const T&) { return false; } @@ -936,6 +945,38 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) { return x; } }; +// Default implementation. +template <typename Scalar, typename Enable = void> +struct fma_impl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& a, const Scalar& b, const Scalar& c) { + return a * b + c; + } +}; + +// ADL version if it exists. 
+template <typename T> +struct fma_impl< + T, + std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>> { + static T run(const T& a, const T& b, const T& c) { return fma(a, b, c); } +}; + +#if defined(EIGEN_GPUCC) +template <> +struct fma_impl<float, void> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float run(const float& a, const float& b, const float& c) { + return ::fmaf(a, b, c); + } +}; + +template <> +struct fma_impl<double, void> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double run(const double& a, const double& b, const double& c) { + return ::fma(a, b, c); + } +}; +#endif + } // end namespace internal /**************************************************************************** @@ -1256,7 +1297,7 @@ // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template <typename T> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) { using UnsignedT = typename internal::make_unsigned<T>::type; EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES) // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations @@ -1269,7 +1310,7 @@ // Integer round down to nearest power of b // T is assumed to be an integer type with a>=0, and b>0 template <typename T, typename U> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T round_down(T a, U b) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) { using UnsignedT = typename internal::make_unsigned<T>::type; using UnsignedU = typename internal::make_unsigned<U>::type; EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES) @@ -1282,7 +1323,7 @@ /** Log base 2 for 32 bits positive integers. * Conveniently returns 0 for x==0. 
*/ -EIGEN_CONSTEXPR inline int log2(int x) { +constexpr int log2(int x) { unsigned int v(x); constexpr int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; @@ -1320,11 +1361,17 @@ /** \returns the cube root of \a x. **/ template <typename T> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cbrt(const T& x) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!NumTraits<T>::IsComplex, T> cbrt(const T& x) { EIGEN_USING_STD(cbrt); return static_cast<T>(cbrt(x)); } +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsComplex, T> cbrt(const T& x) { + EIGEN_USING_STD(pow); + return pow(x, typename NumTraits<T>::Real(1.0 / 3.0)); +} + /** \returns the reciprocal square root of \a x. **/ template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T rsqrt(const T& x) { @@ -1353,17 +1400,17 @@ #endif template <typename T> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real> - abs(const T& x) { +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real> +abs(const T& x) { EIGEN_USING_STD(abs); return abs(x); } template <typename T> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real> - abs(const T& x) { +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real> +abs(const T& x) { return x; } @@ -1843,6 +1890,15 @@ return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n); } +// Use std::fma if available. +using std::fma; + +// Otherwise, rely on template implementation. 
+template <typename Scalar> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) { + return internal::fma_impl<Scalar>::run(x, y, z); +} + } // end namespace numext namespace internal {
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index 8e2705b..cbac1c2 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -76,7 +76,7 @@ static_assert(Steps > 0, "Steps must be at least 1."); using Scalar = typename unpacket_traits<Packet>::type; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) { - constexpr Scalar kMinusHalf = Scalar(-1) / Scalar(2); + const Scalar kMinusHalf = Scalar(-1) / Scalar(2); const Packet cst_minus_half = pset1<Packet>(kMinusHalf); const Packet cst_minus_one = pset1<Packet>(Scalar(-1)); @@ -256,6 +256,48 @@ return ComplexT(numext::log(a), b); } +// For generic scalars, use ternary select. +template <typename Mask> +struct scalar_select_mask<Mask, /*is_built_in_float*/ false> { + static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) { return numext::is_exactly_zero(mask); } +}; + +// For built-in float mask, bitcast the mask to its integer counterpart and use ternary select. +template <typename Mask> +struct scalar_select_mask<Mask, /*is_built_in_float*/ true> { + using IntegerType = typename numext::get_integer_by_size<sizeof(Mask)>::unsigned_type; + static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) { + return numext::is_exactly_zero(numext::bit_cast<IntegerType>(std::abs(mask))); + } +}; + +template <int Size = sizeof(long double)> +struct ldbl_select_mask { + static constexpr int MantissaDigits = std::numeric_limits<long double>::digits; + static constexpr int NumBytes = (MantissaDigits == 64 ? 
80 : 128) / CHAR_BIT; + static EIGEN_DEVICE_FUNC inline bool run(const long double& mask) { + const uint8_t* mask_bytes = reinterpret_cast<const uint8_t*>(&mask); + for (Index i = 0; i < NumBytes; i++) { + if (mask_bytes[i] != 0) return false; + } + return true; + } +}; + +template <> +struct ldbl_select_mask<sizeof(double)> : scalar_select_mask<double> {}; + +template <> +struct scalar_select_mask<long double, true> : ldbl_select_mask<> {}; + +template <typename RealMask> +struct scalar_select_mask<std::complex<RealMask>, false> { + using impl = scalar_select_mask<RealMask>; + static EIGEN_DEVICE_FUNC inline bool run(const std::complex<RealMask>& mask) { + return impl::run(numext::real(mask)) && impl::run(numext::imag(mask)); + } +}; + } // end namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 67590fb..a2c8eba 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h
@@ -258,8 +258,8 @@ /** \brief Moves the matrix into the other one. * */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) - EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) noexcept( + std::is_nothrow_move_assignable<Scalar>::value) { Base::operator=(std::move(other)); return *this; } @@ -393,8 +393,8 @@ template <typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other) : Base(other.derived()) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); } /////////// Geometry module ///////////
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index ec360eb..2ce83a8 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h
@@ -45,8 +45,8 @@ EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 1dc3448..5e4e5c2 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h
@@ -22,13 +22,13 @@ template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized, bool is_integer = NumTraits<T>::IsInteger> struct default_digits_impl { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits; } + EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits; } }; template <typename T> struct default_digits_impl<T, false, false> // Floating point { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { + EIGEN_DEVICE_FUNC constexpr static int run() { using std::ceil; using std::log2; typedef typename NumTraits<T>::Real Real; @@ -39,7 +39,7 @@ template <typename T> struct default_digits_impl<T, false, true> // Integer { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; } + EIGEN_DEVICE_FUNC constexpr static int run() { return 0; } }; // default implementation of digits10(), based on numeric_limits if specialized, @@ -47,13 +47,13 @@ template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized, bool is_integer = NumTraits<T>::IsInteger> struct default_digits10_impl { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits10; } + EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits10; } }; template <typename T> struct default_digits10_impl<T, false, false> // Floating point { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { + EIGEN_DEVICE_FUNC constexpr static int run() { using std::floor; using std::log10; typedef typename NumTraits<T>::Real Real; @@ -64,7 +64,7 @@ template <typename T> struct default_digits10_impl<T, false, true> // Integer { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; } + EIGEN_DEVICE_FUNC constexpr static int run() { return 0; } }; // default implementation of max_digits10(), based on numeric_limits if specialized, @@ -72,13 +72,13 @@ template <typename T, bool use_numeric_limits = 
std::numeric_limits<T>::is_specialized, bool is_integer = NumTraits<T>::IsInteger> struct default_max_digits10_impl { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::max_digits10; } + EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::max_digits10; } }; template <typename T> struct default_max_digits10_impl<T, false, false> // Floating point { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { + EIGEN_DEVICE_FUNC constexpr static int run() { using std::ceil; using std::log10; typedef typename NumTraits<T>::Real Real; @@ -89,7 +89,7 @@ template <typename T> struct default_max_digits10_impl<T, false, true> // Integer { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; } + EIGEN_DEVICE_FUNC constexpr static int run() { return 0; } }; } // end namespace internal @@ -188,32 +188,30 @@ typedef T Nested; typedef T Literal; - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits<T>::epsilon(); } + EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return numext::numeric_limits<T>::epsilon(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl<T>::run(); } + EIGEN_DEVICE_FUNC constexpr static int digits10() { return internal::default_digits10_impl<T>::run(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() { - return internal::default_max_digits10_impl<T>::run(); - } + EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return internal::default_max_digits10_impl<T>::run(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits() { return internal::default_digits_impl<T>::run(); } + EIGEN_DEVICE_FUNC constexpr static int digits() { return internal::default_digits_impl<T>::run(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int min_exponent() { return numext::numeric_limits<T>::min_exponent; } + EIGEN_DEVICE_FUNC constexpr static int min_exponent() { return 
numext::numeric_limits<T>::min_exponent; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_exponent() { return numext::numeric_limits<T>::max_exponent; } + EIGEN_DEVICE_FUNC constexpr static int max_exponent() { return numext::numeric_limits<T>::max_exponent; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { + EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() { // make sure to override this for floating-point types return Real(0); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits<T>::max)(); } + EIGEN_DEVICE_FUNC constexpr static T highest() { return (numext::numeric_limits<T>::max)(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { return (numext::numeric_limits<T>::lowest)(); } + EIGEN_DEVICE_FUNC constexpr static T lowest() { return (numext::numeric_limits<T>::lowest)(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits<T>::infinity(); } + EIGEN_DEVICE_FUNC constexpr static T infinity() { return numext::numeric_limits<T>::infinity(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); } + EIGEN_DEVICE_FUNC constexpr static T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); } }; template <typename T> @@ -221,25 +219,23 @@ template <> struct NumTraits<float> : GenericNumTraits<float> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; } + EIGEN_DEVICE_FUNC constexpr static float dummy_precision() { return 1e-5f; } }; template <> struct NumTraits<double> : GenericNumTraits<double> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; } + EIGEN_DEVICE_FUNC constexpr static double dummy_precision() { return 1e-12; } }; // GPU devices treat `long double` as `double`. 
#ifndef EIGEN_GPU_COMPILE_PHASE template <> struct NumTraits<long double> : GenericNumTraits<long double> { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double dummy_precision() { - return static_cast<long double>(1e-15l); - } + EIGEN_DEVICE_FUNC constexpr static long double dummy_precision() { return static_cast<long double>(1e-15l); } #if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106) // PowerPC double double causes issues with some values - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double epsilon() { + EIGEN_DEVICE_FUNC constexpr static long double epsilon() { // 2^(-(__LDBL_MANT_DIG__)+1) return static_cast<long double>(2.4651903288156618919116517665087e-32l); } @@ -260,10 +256,10 @@ MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost }; - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits<Real>::epsilon(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits<Real>::digits10(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() { return NumTraits<Real>::max_digits10(); } + EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return NumTraits<Real>::epsilon(); } + EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() { return NumTraits<Real>::dummy_precision(); } + EIGEN_DEVICE_FUNC constexpr static int digits10() { return NumTraits<Real>::digits10(); } + EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return NumTraits<Real>::max_digits10(); } }; template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols> @@ -290,25 +286,19 @@ : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost) }; - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar 
dummy_precision() { - return NumTraits<RealScalar>::dummy_precision(); - } + EIGEN_DEVICE_FUNC constexpr static RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); } + EIGEN_DEVICE_FUNC constexpr static RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); } - EIGEN_CONSTEXPR - static inline int digits10() { return NumTraits<Scalar>::digits10(); } - EIGEN_CONSTEXPR - static inline int max_digits10() { return NumTraits<Scalar>::max_digits10(); } + constexpr static int digits10() { return NumTraits<Scalar>::digits10(); } + constexpr static int max_digits10() { return NumTraits<Scalar>::max_digits10(); } }; template <> struct NumTraits<std::string> : GenericNumTraits<std::string> { enum { RequireInitialization = 1, ReadCost = HugeCost, AddCost = HugeCost, MulCost = HugeCost }; - EIGEN_CONSTEXPR - static inline int digits10() { return 0; } - EIGEN_CONSTEXPR - static inline int max_digits10() { return 0; } + constexpr static int digits10() { return 0; } + constexpr static int max_digits10() { return 0; } private: static inline std::string epsilon();
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h index 7b2c8dc..1f638f9 100644 --- a/Eigen/src/Core/PartialReduxEvaluator.h +++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -103,19 +103,36 @@ EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) { if (size == 0) return packetwise_redux_empty_value<PacketType>(func); - const Index size4 = (size - 1) & (~3); + const Index size4 = 1 + numext::round_down(size - 1, 4); PacketType p = eval.template packetByOuterInner<Unaligned, PacketType>(0, 0); - Index i = 1; // This loop is optimized for instruction pipelining: // - each iteration generates two independent instructions // - thanks to branch prediction and out-of-order execution we have independent instructions across loops - for (; i < size4; i += 4) + for (Index i = 1; i < size4; i += 4) p = func.packetOp( p, func.packetOp(func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 0, 0), eval.template packetByOuterInner<Unaligned, PacketType>(i + 1, 0)), func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 2, 0), eval.template packetByOuterInner<Unaligned, PacketType>(i + 3, 0)))); - for (; i < size; ++i) p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0)); + for (Index i = size4; i < size; ++i) + p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0)); + return p; + } +}; + +template <typename Func, typename Evaluator> +struct packetwise_segment_redux_impl { + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar; + + template <typename PacketType> + EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin, + Index count) { + if (size == 0) return packetwise_redux_empty_value<PacketType>(func); + + PacketType p = eval.template packetSegmentByOuterInner<Unaligned, PacketType>(0, 0, begin, count); + for (Index i = 1; i < size; ++i) + p = func.packetOp(p, eval.template packetSegmentByOuterInner<Unaligned, PacketType>(i, 0, begin, count)); return p; } }; @@ -174,14 +191,13 @@ template <int 
LoadMode, typename PacketType> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packet(Index idx) const { - enum { PacketSize = internal::unpacket_traits<PacketType>::size }; - typedef Block<const ArgTypeNestedCleaned, Direction == Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize), - Direction == Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime), true /* InnerPanel */> - PanelType; - - PanelType panel(m_arg, Direction == Vertical ? 0 : idx, Direction == Vertical ? idx : 0, - Direction == Vertical ? m_arg.rows() : Index(PacketSize), - Direction == Vertical ? Index(PacketSize) : m_arg.cols()); + static constexpr int PacketSize = internal::unpacket_traits<PacketType>::size; + static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : PacketSize; + static constexpr int PanelCols = Direction == Vertical ? PacketSize : ArgType::ColsAtCompileTime; + using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>; + using PanelEvaluator = typename internal::redux_evaluator<PanelType>; + using BinaryOp = typename MemberOp::BinaryOp; + using Impl = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>; // FIXME // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of @@ -189,11 +205,39 @@ // by pass "vectorization" in this case: if (PacketSize == 1) return internal::pset1<PacketType>(coeff(idx)); - typedef typename internal::redux_evaluator<PanelType> PanelEvaluator; + Index startRow = Direction == Vertical ? 0 : idx; + Index startCol = Direction == Vertical ? idx : 0; + Index numRows = Direction == Vertical ? m_arg.rows() : PacketSize; + Index numCols = Direction == Vertical ? 
PacketSize : m_arg.cols(); + + PanelType panel(m_arg, startRow, startCol, numRows, numCols); PanelEvaluator panel_eval(panel); - typedef typename MemberOp::BinaryOp BinaryOp; - PacketType p = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>::template run<PacketType>( - panel_eval, m_functor.binaryFunc(), m_arg.outerSize()); + PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize()); + return p; + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index i, Index j, Index begin, Index count) const { + return packetSegment<LoadMode, PacketType>(Direction == Vertical ? j : i, begin, count); + } + + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packetSegment(Index idx, Index begin, Index count) const { + static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : Dynamic; + static constexpr int PanelCols = Direction == Vertical ? Dynamic : ArgType::ColsAtCompileTime; + using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>; + using PanelEvaluator = typename internal::redux_evaluator<PanelType>; + using BinaryOp = typename MemberOp::BinaryOp; + using Impl = internal::packetwise_segment_redux_impl<BinaryOp, PanelEvaluator>; + + Index startRow = Direction == Vertical ? 0 : idx; + Index startCol = Direction == Vertical ? idx : 0; + Index numRows = Direction == Vertical ? m_arg.rows() : begin + count; + Index numCols = Direction == Vertical ? begin + count : m_arg.cols(); + + PanelType panel(m_arg, startRow, startCol, numRows, numCols); + PanelEvaluator panel_eval(panel); + PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), begin, count); return p; }
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index eca7e1f..a78305e 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h
@@ -162,8 +162,8 @@ EIGEN_DEVICE_FUNC Base& base() { return *static_cast<Base*>(this); } EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast<const Base*>(this); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_storage.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_storage.cols(); } /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. @@ -298,7 +298,7 @@ * * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t) */ - EIGEN_DEVICE_FUNC inline constexpr void resize(Index size) { + EIGEN_DEVICE_FUNC constexpr void resize(Index size) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase) eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && @@ -323,7 +323,7 @@ * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC inline constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); } + EIGEN_DEVICE_FUNC constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); } /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special * value \c NoChange as in the example below. @@ -333,7 +333,7 @@ * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC inline constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); } + EIGEN_DEVICE_FUNC constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); } /** Resizes \c *this to have the same dimensions as \a other. 
* Takes care of doing all the checking that's needed. @@ -450,7 +450,7 @@ /** \brief Move constructor */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(PlainObjectBase&&) = default; /** \brief Move assignment operator */ - EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) noexcept { m_storage = std::move(other.m_storage); return *this; }
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 37683e3..e16c7cc 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h
@@ -224,8 +224,8 @@ "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_rhs.cols(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 9de6481..ce8d954 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h
@@ -283,7 +283,7 @@ template <typename Lhs, typename Rhs> struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> { template <typename T> - struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {}; + struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {}; typedef typename Product<Lhs, Rhs>::Scalar Scalar; // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose @@ -445,7 +445,7 @@ eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(), blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha, - std::conditional_t<HasScalarFactor, true_type, false_type>()); + bool_constant<HasScalarFactor>()); } protected: @@ -635,6 +635,24 @@ return packet<LoadMode, PacketType>(row, col); } + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + PacketType res; + typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor, + Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode> + PacketImpl; + PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count); + return res; + } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const { + const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index; + const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 
index : 0; + return packetSegment<LoadMode, PacketType>(row, col, begin, count); + } + protected: add_const_on_value_type_t<LhsNested> m_lhs; add_const_on_value_type_t<RhsNested> m_rhs; @@ -670,6 +688,13 @@ res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))), rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment( + row, col, lhs, rhs, innerDim, res, begin, count); + res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))), + rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res); + } }; template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -681,6 +706,13 @@ res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment( + row, col, lhs, rhs, innerDim, res, begin, count); + res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count), + pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res); + } }; template <typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -689,6 +721,12 @@ Index /*innerDim*/, Packet& res) { res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index 
/*innerDim*/, Packet& res, Index begin, + Index count) { + res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), + rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count)); + } }; template <typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -697,6 +735,12 @@ Index /*innerDim*/, Packet& res) { res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col))); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index /*innerDim*/, Packet& res, Index begin, + Index count) { + res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count), + pset1<Packet>(rhs.coeff(Index(0), col))); + } }; template <typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -705,6 +749,11 @@ const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) { res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, + const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res, + Index /*begin*/, Index /*count*/) { + res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); + } }; template <typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -713,6 +762,11 @@ const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) { res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, + const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res, + Index /*begin*/, Index /*count*/) { + res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); + } }; template <typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -723,6 +777,13 @@ for (Index i = 0; i < innerDim; ++i) res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res); } + static EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); + for (Index i = 0; i < innerDim; ++i) + res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count), + res); + } }; template <typename Lhs, typename Rhs, typename Packet, int LoadMode> @@ -733,6 +794,13 @@ for (Index i = 0; i < innerDim; ++i) res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); + for (Index i = 0; i < innerDim; ++i) + res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)), + res); + } }; /*************************************************************************** @@ -871,6 +939,26 @@ m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id)); } + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count, + internal::true_type) const { + return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count), + internal::pset1<PacketType>(m_diagImpl.coeff(id))); + } + + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count, + internal::false_type) const { + enum { + InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, + DiagonalPacketLoadMode = plain_enum_min( + LoadMode, + ((InnerSize % 16) == 0) ? 
int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!! + }; + return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count), + m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count)); + } + evaluator<DiagonalType> m_diagImpl; evaluator<MatrixType> m_matImpl; }; @@ -892,7 +980,8 @@ typedef typename XprType::PlainObject PlainObject; typedef typename Lhs::DiagonalVectorType DiagonalType; - enum { StorageOrder = Base::StorageOrder_ }; + static constexpr int StorageOrder = Base::StorageOrder_; + using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {} @@ -905,8 +994,7 @@ EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case. // See also similar calls below. - return this->template packet_impl<LoadMode, PacketType>( - row, col, row, std::conditional_t<int(StorageOrder) == RowMajor, internal::true_type, internal::false_type>()); + return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t()); } template <int LoadMode, typename PacketType> @@ -914,6 +1002,19 @@ return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0, int(StorageOrder) == ColMajor ? 0 : idx); } + + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case. + // See also similar calls below. 
+ return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t()); + } + + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const { + return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx, + begin, count); + } #endif }; @@ -933,7 +1034,8 @@ typedef Product<Lhs, Rhs, ProductKind> XprType; typedef typename XprType::PlainObject PlainObject; - enum { StorageOrder = Base::StorageOrder_ }; + static constexpr int StorageOrder = Base::StorageOrder_; + using IsColMajor_t = bool_constant<StorageOrder == ColMajor>; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {} @@ -944,14 +1046,23 @@ #ifndef EIGEN_GPUCC template <int LoadMode, typename PacketType> EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return this->template packet_impl<LoadMode, PacketType>( - row, col, col, std::conditional_t<int(StorageOrder) == ColMajor, internal::true_type, internal::false_type>()); + return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t()); } template <int LoadMode, typename PacketType> EIGEN_STRONG_INLINE PacketType packet(Index idx) const { - return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0, - int(StorageOrder) == ColMajor ? 0 : idx); + return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 
0 : idx); + } + + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t()); + } + + template <int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const { + return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx, + begin, count); } #endif };
diff --git a/Eigen/src/Core/RandomImpl.h b/Eigen/src/Core/RandomImpl.h index 76e43f5..efba336 100644 --- a/Eigen/src/Core/RandomImpl.h +++ b/Eigen/src/Core/RandomImpl.h
@@ -122,7 +122,7 @@ ((std::numeric_limits<long double>::digits != (2 * std::numeric_limits<double>::digits)))> struct random_longdouble_impl { static constexpr int Size = sizeof(long double); - static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<long double>::digits() - 1; } + static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<long double>::digits() - 1; } static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) { eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits()); EIGEN_USING_STD(memcpy); @@ -140,7 +140,7 @@ }; template <> struct random_longdouble_impl<false> { - static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<double>::digits() - 1; } + static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<double>::digits() - 1; } static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) { return static_cast<long double>(random_float_impl<double>::run(numRandomBits)); }
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 0c5f2d9..4e9ab0e 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h
@@ -414,6 +414,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const { return Base::template packet<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + + template <int LoadMode, typename PacketType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentByOuterInner(Index outer, Index inner, Index begin, + Index count) const { + return Base::template packetSegment<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer, + begin, count); + } }; } // end namespace internal
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 129bc85..30ec277 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h
@@ -73,11 +73,11 @@ typedef MapBase<Derived> Base; EIGEN_DENSE_PUBLIC_INTERFACE(RefBase) - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { + EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { + EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() : IsVectorAtCompileTime ? this->size() : int(Flags) & RowMajorBit ? this->cols() @@ -97,11 +97,11 @@ typedef Stride<StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime> StrideBase; // Resolves inner stride if default 0. - static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; } + static EIGEN_DEVICE_FUNC constexpr Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; } // Resolves outer stride if default 0. - static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, - bool isVectorAtCompileTime, bool isRowMajor) { + static EIGEN_DEVICE_FUNC constexpr Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, + bool isVectorAtCompileTime, bool isRowMajor) { return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer; }
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index 11d7ad1..3415045 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h
@@ -85,8 +85,8 @@ THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_matrix.rows() * m_rowFactor.value(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_matrix.cols() * m_colFactor.value(); } EIGEN_DEVICE_FUNC const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h index 4b34e16..22acdc0 100644 --- a/Eigen/src/Core/Reshaped.h +++ b/Eigen/src/Core/Reshaped.h
@@ -215,10 +215,10 @@ EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; } /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return m_xpr.innerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return m_xpr.innerStride(); } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { + EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return (((Flags & RowMajorBit) == RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride(); }
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index 3b5e470..892c193 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h
@@ -58,12 +58,8 @@ EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { static_cast<const Derived*>(this)->evalTo(dst); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { - return static_cast<const Derived*>(this)->rows(); - } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { - return static_cast<const Derived*>(this)->cols(); - } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return static_cast<const Derived*>(this)->rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return static_cast<const Derived*>(this)->cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN #define Unusable \
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index eb06fff..d11ba16 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h
@@ -87,8 +87,8 @@ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse) - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); } EIGEN_DEVICE_FUNC inline Index innerStride() const { return -m_matrix.innerStride(); }
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h index 9f46120..0fa5f1e 100644 --- a/Eigen/src/Core/Select.h +++ b/Eigen/src/Core/Select.h
@@ -63,8 +63,8 @@ eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols()); } - inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); } - inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_condition.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_condition.cols(); } inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i, Index j) const { if (m_condition.coeff(i, j))
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 4e9a923..16f0e75 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h
@@ -73,10 +73,10 @@ EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_matrix.outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.innerStride(); } /** \sa MatrixBase::coeff() * \warning the coordinates must fit into the referenced triangular part
diff --git a/Eigen/src/Core/SkewSymmetricMatrix3.h b/Eigen/src/Core/SkewSymmetricMatrix3.h index 1945fd3..3545afc 100644 --- a/Eigen/src/Core/SkewSymmetricMatrix3.h +++ b/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -66,7 +66,7 @@ EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } /** Determinant vanishes */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar determinant() const { return 0; } + EIGEN_DEVICE_FUNC constexpr Scalar determinant() const { return 0; } /** A.transpose() = -A */ EIGEN_DEVICE_FUNC PlainObject transpose() const { return (-vector()).asSkewSymmetric(); } @@ -91,9 +91,9 @@ EIGEN_DEVICE_FUNC inline SkewSymmetricVectorType& vector() { return derived().vector(); } /** \returns the number of rows. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return 3; } + EIGEN_DEVICE_FUNC constexpr Index rows() const { return 3; } /** \returns the number of columns. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return 3; } + EIGEN_DEVICE_FUNC constexpr Index cols() const { return 3; } /** \returns the matrix product of \c *this by the dense matrix, \a matrix */ template <typename MatrixDerived>
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h index dfea9c6..aa51410 100644 --- a/Eigen/src/Core/Solve.h +++ b/Eigen/src/Core/Solve.h
@@ -66,8 +66,8 @@ Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 26d62ff..9d31874 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h
@@ -216,8 +216,8 @@ triangular_solve_retval(const TriangularType& tri, const Rhs& rhs) : m_triangularMatrix(tri), m_rhs(rhs) {} - inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); } - inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } + constexpr Index rows() const noexcept { return m_rhs.rows(); } + constexpr Index cols() const noexcept { return m_rhs.cols(); } template <typename Dest> inline void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h index 4576cc0..a24d4c2 100644 --- a/Eigen/src/Core/StlIterators.h +++ b/Eigen/src/Core/StlIterators.h
@@ -36,11 +36,11 @@ typedef Index difference_type; typedef std::random_access_iterator_tag iterator_category; - indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {} - indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {} + indexed_based_stl_iterator_base() noexcept : mp_xpr(0), m_index(0) {} + indexed_based_stl_iterator_base(XprType& xpr, Index index) noexcept : mp_xpr(&xpr), m_index(index) {} - indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW : mp_xpr(other.mp_xpr), - m_index(other.m_index) {} + indexed_based_stl_iterator_base(const non_const_iterator& other) noexcept + : mp_xpr(other.mp_xpr), m_index(other.m_index) {} indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) { mp_xpr = other.mp_xpr; @@ -335,15 +335,14 @@ typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer; typedef std::conditional_t<bool(is_lvalue), value_type&, const value_type&> reference; - pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} - pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) { + pointer_based_stl_iterator() noexcept : m_ptr(0) {} + pointer_based_stl_iterator(XprType& xpr, Index index) noexcept : m_incr(xpr.innerStride()) { m_ptr = xpr.data() + index * m_incr.value(); } - pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW : m_ptr(other.m_ptr), - m_incr(other.m_incr) {} + pointer_based_stl_iterator(const non_const_iterator& other) noexcept : m_ptr(other.m_ptr), m_incr(other.m_incr) {} - pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW { + pointer_based_stl_iterator& operator=(const non_const_iterator& other) noexcept { m_ptr = other.m_ptr; m_incr.setValue(other.m_incr); return *this;
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 14b025c..692f0a1 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h
@@ -78,9 +78,9 @@ } /** \returns the outer stride */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); } + EIGEN_DEVICE_FUNC constexpr Index outer() const { return m_outer.value(); } /** \returns the inner stride */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); } + EIGEN_DEVICE_FUNC constexpr Index inner() const { return m_inner.value(); } protected: internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index d417c1a..dd825e9 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h
@@ -65,6 +65,31 @@ Index col = Base::colIndexByOuterInner(outer, inner); assignPacket<StoreMode, LoadMode, PacketType>(row, col); } + + template <int StoreMode, int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) { + PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count); + const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>( + row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count); + m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count); + } + + template <int StoreMode, int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) { + PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count); + const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>( + index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count); + m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count); + } + + // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I + // mean no CRTP (Gael) + template <int StoreMode, int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) { + Index row = Base::rowIndexByOuterInner(outer, inner); + Index col = Base::colIndexByOuterInner(outer, inner); + assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count); + } }; } // namespace internal
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 89e3d95..0676a25 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h
@@ -65,8 +65,8 @@ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.rows(); } /** \returns the nested expression */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 6fbbbd8..f6dd258 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h
@@ -293,9 +293,9 @@ public: explicit Transpose(const TranspositionType& t) : m_transpositions(t) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_transpositions.size(); } /** \returns the \a matrix with the inverse transpositions applied to the columns. */
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 2b1683b..27ad78e 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h
@@ -58,10 +58,10 @@ eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); } + EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); } + EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); } // dummy resize function EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { @@ -194,9 +194,9 @@ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView) /** \copydoc EigenBase::rows() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); } /** \copydoc EigenBase::cols() */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); } /** \returns a const reference to the nested expression */ EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 9887db6..ac52dc5 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h
@@ -36,6 +36,7 @@ class PartialReduxExpr; namespace internal { + template <typename MatrixType, typename MemberOp, int Direction> struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> { typedef typename MemberOp::result_type Scalar; @@ -63,12 +64,8 @@ EIGEN_DEVICE_FUNC explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp()) : m_matrix(mat), m_functor(func) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { - return (Direction == Vertical ? 1 : m_matrix.rows()); - } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { - return (Direction == Horizontal ? 1 : m_matrix.cols()); - } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return (Direction == Vertical ? 1 : m_matrix.rows()); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return (Direction == Horizontal ? 1 : m_matrix.cols()); } EIGEN_DEVICE_FUNC typename MatrixType::Nested nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 198ec95..0450e2d 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h
@@ -25,14 +25,12 @@ template <typename Visitor, bool ShortCircuitEvaluation = false> struct short_circuit_eval_impl { // if short circuit evaluation is not used, do nothing - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; } + static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; } }; template <typename Visitor> struct short_circuit_eval_impl<Visitor, true> { // if short circuit evaluation is used, check the visitor - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) { - return visitor.done(); - } + static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) { return visitor.done(); } }; // unrolled inner-outer traversal @@ -296,9 +294,9 @@ EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType& xpr) : m_evaluator(xpr), m_xpr(xpr) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_xpr.size(); } // outer-inner access EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { return m_evaluator.coeff(row, col); @@ -632,6 +630,17 @@ }; }; +template <typename Derived, bool AlwaysTrue = NumTraits<typename traits<Derived>::Scalar>::IsInteger> +struct all_finite_impl { + static EIGEN_DEVICE_FUNC inline bool run(const Derived& /*derived*/) { return true; } +}; +#if !defined(__FINITE_MATH_ONLY__) || !(__FINITE_MATH_ONLY__) +template <typename Derived> +struct all_finite_impl<Derived, false> { + static 
EIGEN_DEVICE_FUNC inline bool run(const Derived& derived) { return derived.array().isFiniteTyped().all(); } +}; +#endif + } // end namespace internal /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const @@ -781,7 +790,7 @@ */ template <typename Derived> EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::allFinite() const { - return derived().array().isFiniteTyped().all(); + return internal::all_finite_impl<Derived>::run(derived()); } } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index d5506da..a4a87c4 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -475,19 +475,11 @@ } template <> EIGEN_STRONG_INLINE Packet4cf pnmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) { - __m256 a_odd = _mm256_movehdup_ps(a.v); - __m256 a_even = _mm256_moveldup_ps(a.v); - __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)); - __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmaddsub_ps(a_even, b.v, c.v)); - return Packet4cf(result); + return pnegate(pmsub(a, b, c)); } template <> EIGEN_STRONG_INLINE Packet4cf pnmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) { - __m256 a_odd = _mm256_movehdup_ps(a.v); - __m256 a_even = _mm256_moveldup_ps(a.v); - __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)); - __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmsubadd_ps(a_even, b.v, c.v)); - return Packet4cf(result); + return pnegate(pmadd(a, b, c)); } // std::complex<double> template <> @@ -508,21 +500,64 @@ } template <> EIGEN_STRONG_INLINE Packet2cd pnmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) { - __m256d a_odd = _mm256_permute_pd(a.v, 0xF); - __m256d a_even = _mm256_movedup_pd(a.v); - __m256d b_swap = _mm256_permute_pd(b.v, 0x5); - __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmaddsub_pd(a_even, b.v, c.v)); - return Packet2cd(result); + return pnegate(pmsub(a, b, c)); } template <> EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) { - __m256d a_odd = _mm256_permute_pd(a.v, 0xF); - __m256d a_even = _mm256_movedup_pd(a.v); - __m256d b_swap = _mm256_permute_pd(b.v, 0x5); - __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmsubadd_pd(a_even, b.v, c.v)); - return Packet2cd(result); + return pnegate(pmadd(a, b, c)); } #endif + +/*---------------- load/store segment support ----------------*/ + +/*---------------- std::complex<float> ----------------*/ + +template <> +struct has_packet_segment<Packet2cf> : std::true_type {}; + +template <> +struct has_packet_segment<Packet4cf> : 
std::true_type {}; + +template <> +inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) { + return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count)); +} + +template <> +inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin, + Index count) { + _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v); +} + +template <> +inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) { + return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count)); +} + +template <> +inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin, + Index count) { + _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v); +} + +/*---------------- std::complex<double> ----------------*/ + +template <> +struct has_packet_segment<Packet2cd> : std::true_type {}; + +template <> +inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) { + return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count)); +} + +template <> +inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, + Index begin, Index count) { + _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v); +} + +/*---------------- end load/store segment support ----------------*/ + } // end namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index a5c38e7..5b7285f 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -28,6 +28,7 @@ EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d) EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d) EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet4d) +EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d) #ifdef EIGEN_VECTORIZE_AVX2 EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d) EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d) @@ -106,6 +107,8 @@ BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh) + +#ifndef EIGEN_VECTORIZE_AVX512FP16 F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos) F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp) F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp2) @@ -118,6 +121,7 @@ F16_PACKET_FUNCTION(Packet8f, Packet8h, psin) F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt) F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh) +#endif } // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index c29523a..470e36d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -122,6 +122,7 @@ HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, @@ -150,6 +151,7 @@ HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasATan = 1, HasATanh = 1, HasBlend = 1 @@ -1839,10 +1841,13 @@ return a; } +#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) { return _mm_cmpgt_epi16(_mm_setzero_si128(), a); } +#endif // EIGEN_VECTORIZE_AVX512FP16 + template <> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return _mm_cmpgt_epi16(_mm_setzero_si128(), a); @@ -2044,10 +2049,13 @@ return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0; } +#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE bool predux_any(const Packet8h& x) { return _mm_movemask_epi8(x) != 0; } +#endif // EIGEN_VECTORIZE_AVX512FP16 + template <> EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& x) { return _mm_movemask_epi8(x) != 0; @@ -2211,7 +2219,6 @@ }; typedef Packet8h half; }; -#endif template <> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { @@ -2411,6 +2418,26 @@ } template <> +EIGEN_STRONG_INLINE Packet8h pmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return float2half(pmadd(half2float(a), half2float(b), half2float(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return float2half(pmsub(half2float(a), half2float(b), half2float(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return float2half(pnmadd(half2float(a), half2float(b), half2float(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return float2half(pnmsub(half2float(a), half2float(b), half2float(c))); +} + +template <> EIGEN_STRONG_INLINE Packet8h 
pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -2446,14 +2473,12 @@ to[stride * 7] = aux[7]; } -#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) { Packet8f af = half2float(a); float reduced = predux<Packet8f>(af); return Eigen::half(reduced); } -#endif template <> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) { @@ -2553,6 +2578,8 @@ kernel.packet[3] = pload<Packet8h>(out[3]); } +#endif + // BFloat16 implementation. EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { @@ -2781,6 +2808,26 @@ } template <> +EIGEN_STRONG_INLINE Packet8bf pmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pnmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pnmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b))); } @@ -2893,6 +2940,258 @@ kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47); } +/*---------------- load/store segment support ----------------*/ + +// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere. 
+inline __m128i segment_mask_4x8(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 4); + long long mask = 1; + mask <<= CHAR_BIT * count; + mask--; + mask <<= CHAR_BIT * begin; +#if defined(_WIN32) && !defined(_WIN64) + return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask)); +#else + return _mm_cvtsi64_si128(mask); +#endif +} + +// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_8x8(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 8); + long long mask = 1; + // avoid UB when count == 8 + mask <<= (CHAR_BIT / 2) * count; + mask <<= (CHAR_BIT / 2) * count; + mask--; + mask <<= CHAR_BIT * begin; +#if defined(_WIN32) && !defined(_WIN64) + return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask)); +#else + return _mm_cvtsi64_si128(mask); +#endif +} + +// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_4x32(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 4); + return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count)); +} + +// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_2x64(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 2); + return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count)); +} + +// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere. 
+inline __m256i segment_mask_8x32(Index begin, Index count) { + __m128i mask_epi8 = segment_mask_8x8(begin, count); +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8); +#else + __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8); + __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32)); + __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1); +#endif + return mask_epi32; +} + +// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m256i segment_mask_4x64(Index begin, Index count) { + __m128i mask_epi8 = segment_mask_4x8(begin, count); +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8); +#else + __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8); + __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16)); + __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1); +#endif + return mask_epi64; +} + +/*---------------- float ----------------*/ + +template <> +struct has_packet_segment<Packet4f> : std::true_type {}; + +template <> +struct has_packet_segment<Packet8f> : std::true_type {}; + +template <> +inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) { + return _mm_maskload_ps(from, segment_mask_4x32(begin, count)); +} + +template <> +inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) { + _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from); +} + +template <> +inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) { + return _mm256_maskload_ps(from, segment_mask_8x32(begin, count)); +} + +template <> +inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) { + _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from); +} + 
+/*---------------- int32 ----------------*/ + +template <> +struct has_packet_segment<Packet4i> : std::true_type {}; + +template <> +struct has_packet_segment<Packet8i> : std::true_type {}; + +#ifdef EIGEN_VECTORIZE_AVX2 + +template <> +inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) { + return _mm_maskload_epi32(from, segment_mask_4x32(begin, count)); +} + +template <> +inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) { + _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from); +} + +template <> +inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) { + return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count)); +} + +template <> +inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) { + _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from); +} + +#else + +template <> +inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) { + return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count)); +} + +template <> +inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) { + pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count); +} + +template <> +inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) { + return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count)); +} + +template <> +inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) { + pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count); +} + +#endif + +/*---------------- uint32 ----------------*/ + +template <> +struct has_packet_segment<Packet4ui> : std::true_type {}; + +template <> +struct 
has_packet_segment<Packet8ui> : std::true_type {}; + +template <> +inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) { + return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count)); +} + +template <> +inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) { + pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count); +} + +template <> +inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) { + return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count)); +} + +template <> +inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) { + pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count); +} + +/*---------------- double ----------------*/ + +template <> +struct has_packet_segment<Packet2d> : std::true_type {}; + +template <> +struct has_packet_segment<Packet4d> : std::true_type {}; + +template <> +inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) { + return _mm_maskload_pd(from, segment_mask_2x64(begin, count)); +} + +template <> +inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) { + _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from); +} + +template <> +inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) { + return _mm256_maskload_pd(from, segment_mask_4x64(begin, count)); +} + +template <> +inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) { + _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from); +} + +#ifdef EIGEN_VECTORIZE_AVX2 + +/*---------------- int64_t ----------------*/ + +template <> +struct has_packet_segment<Packet2l> : std::true_type {}; + +template <> 
+struct has_packet_segment<Packet4l> : std::true_type {}; + +template <> +inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) { + return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count)); +} +template <> +inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) { + _mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from); +} +template <> +inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) { + return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count)); +} +template <> +inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) { + _mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from); +} + +/*---------------- uint64_t ----------------*/ + +template <> +struct has_packet_segment<Packet4ul> : std::true_type {}; + +template <> +inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) { + return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count)); +} +template <> +inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) { + pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count); +} +#endif + +/*---------------- end load/store segment support ----------------*/ + } // end namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 9dcd6ef..5b73ffe 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -279,20 +279,22 @@ } #endif +#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) { return half2float(a); } template <> -EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) { - return Bf16ToF32(a); -} - -template <> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) { return float2half(a); } +#endif + +template <> +EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) { + return Bf16ToF32(a); +} template <> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 6039254..04499a0 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,16 +47,16 @@ #if EIGEN_FAST_MATH template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& _x) { - return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x)); +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& x) { + return generic_sqrt_newton_step<Packet16f>::run(x, _mm512_rsqrt14_ps(x)); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& _x) { +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& x) { #ifdef EIGEN_VECTORIZE_AVX512ER - return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x)); + return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x)); #else - return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x)); + return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x)); #endif } #else @@ -80,19 +80,19 @@ #elif EIGEN_FAST_MATH template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& _x) { - return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x)); +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& x) { + return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(x, _mm512_rsqrt14_ps(x)); } #endif // prsqrt for double. 
#if EIGEN_FAST_MATH template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& _x) { +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& x) { #ifdef EIGEN_VECTORIZE_AVX512ER - return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x)); + return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x)); #else - return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x)); + return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x)); #endif } @@ -118,6 +118,8 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh) + +#ifndef EIGEN_VECTORIZE_AVX512FP16 F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos) F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2) @@ -130,6 +132,7 @@ F16_PACKET_FUNCTION(Packet16f, Packet16h, psin) F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt) F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh) +#endif // EIGEN_VECTORIZE_AVX512FP16 } // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h new file mode 100644 index 0000000..240ade4 --- /dev/null +++ b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
@@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 The Eigen Authors. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_FP16_AVX512_H +#define EIGEN_MATH_FUNCTIONS_FP16_AVX512_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) { + __m512i result = _mm512_castsi256_si512(_mm256_castph_si256(a)); + result = _mm512_inserti64x4(result, _mm256_castph_si256(b), 1); + return _mm512_castsi512_ph(result); +} + +EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) { + a = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_castph_si512(x))); + b = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(_mm512_castph_si512(x), 1)); +} + +#define _EIGEN_GENERATE_FP16_MATH_FUNCTION(func) \ + template <> \ + EIGEN_STRONG_INLINE Packet8h func<Packet8h>(const Packet8h& a) { \ + return float2half(func(half2float(a))); \ + } \ + \ + template <> \ + EIGEN_STRONG_INLINE Packet16h func<Packet16h>(const Packet16h& a) { \ + return float2half(func(half2float(a))); \ + } \ + \ + template <> \ + EIGEN_STRONG_INLINE Packet32h func<Packet32h>(const Packet32h& a) { \ + Packet16h low; \ + Packet16h high; \ + extract2Packet16h(a, low, high); \ + return combine2Packet16h(func(low), func(high)); \ + } + +_EIGEN_GENERATE_FP16_MATH_FUNCTION(psin) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(pcos) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog2) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog1p) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexpm1) +_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp2) 
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(ptanh)
+#undef _EIGEN_GENERATE_FP16_MATH_FUNCTION
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 5d869e4..27a0f10 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -40,6 +40,10 @@ #endif typedef eigen_packet_wrapper<__m256i, 2> Packet16bf; +typedef eigen_packet_wrapper<__m512i, 6> Packet32s; +typedef eigen_packet_wrapper<__m256i, 6> Packet16s; +typedef eigen_packet_wrapper<__m128i, 6> Packet8s; + template <> struct is_arithmetic<__m512> { enum { value = true }; @@ -124,6 +128,7 @@ HasATanh = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -149,6 +154,7 @@ HasBlend = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, @@ -249,6 +255,39 @@ #endif template <> +struct unpacket_traits<Packet32s> { + typedef numext::int16_t type; + typedef Packet16s half; + enum { + size = 32, + alignment = Aligned64, + vectorizable = false, + }; +}; + +template <> +struct unpacket_traits<Packet16s> { + typedef numext::int16_t type; + typedef Packet8s half; + enum { + size = 16, + alignment = Aligned32, + vectorizable = false, + }; +}; + +template <> +struct unpacket_traits<Packet8s> { + typedef numext::int16_t type; + typedef Packet8s half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = false, + }; +}; + +template <> EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) { return _mm512_set1_ps(from); } @@ -1335,10 +1374,13 @@ return _mm512_abs_epi64(a); } +#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) { return _mm256_srai_epi16(a, 15); } +#endif // EIGEN_VECTORIZE_AVX512FP16 + template <> EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) { return _mm256_srai_epi16(a, 15); @@ -2199,6 +2241,7 @@ } // Packet math for Eigen::half +#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { return _mm256_set1_epi16(from.x); @@ -2223,6 +2266,7 @@ EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { // (void*) -> workaround clang warning: // cast from 'Eigen::half *' to 
'__m256i *' increases required alignment from 2 to 32 + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256((__m256i*)(void*)to, from); } @@ -2230,6 +2274,7 @@ EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { // (void*) -> workaround clang warning: // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256((__m256i*)(void*)to, from); } @@ -2369,7 +2414,6 @@ return _mm256_xor_si256(a, sign_mask); } -#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); @@ -2403,13 +2447,31 @@ } template <> +EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return float2half(pmadd(half2float(a), half2float(b), half2float(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return float2half(pmsub(half2float(a), half2float(b), half2float(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return float2half(pnmadd(half2float(a), half2float(b), half2float(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return float2half(pnmsub(half2float(a), half2float(b), half2float(c))); +} + +template <> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) { Packet16f from_float = half2float(from); return half(predux(from_float)); } -#endif - template <> EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) { Packet8h lane0 = _mm256_extractf128_si256(a, 0); @@ -2643,6 +2705,8 @@ kernel.packet[3] = pload<Packet16h>(out[3]); } +#endif // EIGEN_VECTORIZE_AVX512FP16 + template <> struct is_arithmetic<Packet16bf> { enum { value = true }; @@ 
-2714,11 +2778,13 @@ template <> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); } template <> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } @@ -2889,7 +2955,27 @@ template <> EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) { - return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b))); + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) { + return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) { + return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) { + return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) { + return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); } template <> @@ -3095,6 +3181,172 @@ kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31); } +// Minimal implementation of 16-bit int packets for use in pfrexp, pldexp. 
+ +template <> +EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) { + return _mm512_set1_epi16(x); +} + +template <> +EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) { + return _mm256_set1_epi16(x); +} + +template <> +EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) { + return _mm_set1_epi16(x); +} + +template <> +EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) { + EIGEN_DEBUG_ALIGNED_STORE + _mm512_store_epi32(out, x); +} + +template <> +EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) { + EIGEN_DEBUG_ALIGNED_STORE +#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL) + _mm256_store_epi32(out, x); +#else + _mm256_store_si256(reinterpret_cast<__m256i*>(out), x); +#endif +} + +template <> +EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) { + EIGEN_DEBUG_ALIGNED_STORE +#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL) + _mm256_store_epi32(out, x); +#else + _mm_store_si128(reinterpret_cast<__m128i*>(out), x); +#endif +} + +template <> +EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) { + EIGEN_DEBUG_UNALIGNED_STORE + _mm512_storeu_epi32(out, x); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) { + EIGEN_DEBUG_UNALIGNED_STORE + _mm256_storeu_epi32(out, x); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) { + EIGEN_DEBUG_UNALIGNED_STORE + _mm_storeu_epi32(out, x); +} + +template <> +EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) { + return _mm512_add_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) { + return 
_mm256_add_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) { + return _mm_add_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) { + return _mm512_sub_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) { + return _mm256_sub_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) { + return _mm_sub_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) { + return _mm512_mullo_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) { + return _mm256_mullo_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) { + return _mm_mullo_epi16(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) { + return _mm512_sub_epi16(_mm512_setzero_si512(), a); +} + +template <> +EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) { + return _mm256_sub_epi16(_mm256_setzero_si256(), a); +} + +template <> +EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { + return _mm_sub_epi16(_mm_setzero_si128(), a); +} + +template <int N> +EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) { + return _mm512_srai_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) { + return _mm256_srai_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { + return _mm_srai_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) { + return _mm512_slli_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) { + return _mm256_slli_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet8s 
plogical_shift_left(Packet8s a) { + return _mm_slli_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) { + return _mm512_srli_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) { + return _mm256_srli_epi16(a, N); +} + +template <int N> +EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) { + return _mm_srli_epi16(a, N); +} + } // end namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h index 038e233..ef64bc5 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// +// Copyright (C) 2025 The Eigen Authors. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -18,8 +18,8 @@ namespace internal { typedef __m512h Packet32h; -typedef eigen_packet_wrapper<__m256i, 1> Packet16h; -typedef eigen_packet_wrapper<__m128i, 2> Packet8h; +typedef __m256h Packet16h; +typedef __m128h Packet8h; template <> struct is_arithmetic<Packet8h> { @@ -68,6 +68,7 @@ struct unpacket_traits<Packet32h> { typedef Eigen::half type; typedef Packet16h half; + typedef Packet32s integer_packet; enum { size = 32, alignment = Aligned64, @@ -81,6 +82,7 @@ struct unpacket_traits<Packet16h> { typedef Eigen::half type; typedef Packet8h half; + typedef Packet16s integer_packet; enum { size = 16, alignment = Aligned32, @@ -94,6 +96,7 @@ struct unpacket_traits<Packet8h> { typedef Eigen::half type; typedef Packet8h half; + typedef Packet8s integer_packet; enum { size = 8, alignment = Aligned16, @@ -103,14 +106,33 @@ }; }; +// Conversions + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtxph_ps(a); } + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { return _mm256_cvtxph_ps(a); } + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { return _mm512_cvtxps_ph(a); } + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { return _mm256_cvtxps_ph(a); } + // Memory functions // pset1 template <> EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) { - // half/half_raw is bit compatible - return _mm512_set1_ph(numext::bit_cast<_Float16>(from)); + return _mm512_set1_ph(from.x); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { + return _mm256_set1_ph(from.x); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { + return 
_mm_set1_ph(from.x); } template <> @@ -118,24 +140,47 @@ return _mm512_setzero_ph(); } +template <> +EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) { + return _mm256_setzero_ph(); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) { + return _mm_setzero_ph(); +} + // pset1frombits template <> EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) { return _mm512_castsi512_ph(_mm512_set1_epi16(from)); } +template <> +EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) { + return _mm256_castsi256_ph(_mm256_set1_epi16(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) { + return _mm_castsi128_ph(_mm_set1_epi16(from)); +} + // pfirst template <> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return half_impl::raw_uint16_to_half( - static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0))); -#else - Eigen::half dest[32]; - _mm512_storeu_ph(dest, from); - return dest[0]; -#endif + return Eigen::half(_mm512_cvtsh_h(from)); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) { + return Eigen::half(_mm256_cvtsh_h(from)); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) { + return Eigen::half(_mm_cvtsh_h(from)); } // pload @@ -145,6 +190,16 @@ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from); } +template <> +EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from); +} + // ploadu template <> @@ -152,6 +207,16 @@ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from); } +template <> +EIGEN_STRONG_INLINE Packet16h 
ploadu<Packet16h>(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from); +} + +template <> +EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from); +} + // pstore template <> @@ -159,6 +224,16 @@ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from); } +template <> +EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from); +} + +template <> +EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from); +} + // pstoreu template <> @@ -166,6 +241,16 @@ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from); } +template <> +EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from); +} + // ploaddup template <> EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) { @@ -175,6 +260,17 @@ a); } +template <> +EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) { + __m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from)); + return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a); +} + +template <> +EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) { + return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x); +} + // ploadquad template <> EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) { @@ -184,6 +280,17 @@ a); } +template <> +EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) { + return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x, + 
from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x); +} + +template <> +EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) { + return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x); +} + // pabs template <> @@ -191,6 +298,16 @@ return _mm512_abs_ph(a); } +template <> +EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) { + return _mm256_abs_ph(a); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) { + return _mm_abs_ph(a); +} + // psignbit template <> @@ -198,6 +315,16 @@ return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15)); } +template <> +EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) { + return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) { + return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15)); +} + // pmin template <> @@ -205,6 +332,16 @@ return _mm512_min_ph(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) { + return _mm256_min_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) { + return _mm_min_ph(a, b); +} + // pmax template <> @@ -212,6 +349,16 @@ return _mm512_max_ph(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) { + return _mm256_max_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) { + return _mm_max_ph(a, b); +} + // plset template <> EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) { @@ -219,6 +366,16 @@ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } +template <> +EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) { + return _mm256_add_ph(pset1<Packet16h>(a), 
_mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) { + return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0)); +} + // por template <> @@ -226,6 +383,16 @@ return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); } +template <> +EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) { + return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) { + return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b))); +} + // pxor template <> @@ -233,6 +400,16 @@ return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); } +template <> +EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) { + return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) { + return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b))); +} + // pand template <> @@ -240,6 +417,16 @@ return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); } +template <> +EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) { + return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) { + return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b))); +} + // pandnot template <> @@ -247,6 +434,16 @@ return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a))); } +template <> +EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) { + return 
_mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) { + return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a))); +} + // pselect template <> @@ -255,6 +452,18 @@ return _mm512_mask_blend_ph(mask32, a, b); } +template <> +EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { + __mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ); + return _mm256_mask_blend_ph(mask16, a, b); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { + __mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ); + return _mm_mask_blend_ph(mask8, a, b); +} + // pcmp_eq template <> @@ -263,6 +472,18 @@ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu))); } +template <> +EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) { + __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ); + return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) { + __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ); + return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + // pcmp_le template <> @@ -271,6 +492,18 @@ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu))); } +template <> +EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) { + __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ); + return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, 
static_cast<short>(0xffffu))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) { + __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ); + return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + // pcmp_lt template <> @@ -279,6 +512,18 @@ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu))); } +template <> +EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) { + __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ); + return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) { + __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ); + return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + // pcmp_lt_or_nan template <> @@ -287,6 +532,18 @@ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu))); } +template <> +EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) { + __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ); + return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) { + __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ); + return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu))); +} + // padd template <> @@ -296,12 +553,12 @@ template <> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); + return _mm256_add_ph(a, b); } template <> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, 
const Packet8h& b) { - return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); + return _mm_add_ph(a, b); } // psub @@ -313,12 +570,12 @@ template <> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); + return _mm256_sub_ph(a, b); } template <> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); + return _mm_sub_ph(a, b); } // pmul @@ -330,12 +587,12 @@ template <> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); + return _mm256_mul_ph(a, b); } template <> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); + return _mm_mul_ph(a, b); } // pdiv @@ -347,12 +604,13 @@ template <> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) { - return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); + return _mm256_div_ph(a, b); } template <> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) { - return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); + return _mm_div_ph(a, b); + ; } // pround @@ -361,14 +619,40 @@ EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) { // Work-around for default std::round rounding mode. - // Mask for the sign bit - const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u)); - // The largest half-preicision float less than 0.5 + // Mask for the sign bit. 
+ const Packet32h signMask = + pset1frombits<Packet32h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u))); + // The largest half-precision float less than 0.5. const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu)); return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO); } +template <> +EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) { + // Work-around for default std::round rounding mode. + + // Mask for the sign bit. + const Packet16h signMask = + pset1frombits<Packet16h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u))); + // The largest half-precision float less than 0.5. + const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu)); + + return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) { + // Work-around for default std::round rounding mode. + + // Mask for the sign bit. + const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u))); + // The largest half-precision float less than 0.5. 
+ const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu)); + + return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + // print template <> @@ -376,6 +660,16 @@ return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION); } +template <> +EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) { + return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION); +} + +template <> +EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) { + return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION); +} + // pceil template <> @@ -383,6 +677,16 @@ return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF); } +template <> +EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) { + return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) { + return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF); +} + // pfloor template <> @@ -390,6 +694,16 @@ return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF); } +template <> +EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) { + return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) { + return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF); +} + // ptrunc template <> @@ -397,47 +711,99 @@ return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO); } +template <> +EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) { + return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO); +} + +template <> +EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) { + return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO); +} + // predux template <> EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) { - return (half)_mm512_reduce_add_ph(a); + return half(_mm512_reduce_add_ph(a)); } template <> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) { - return 
(half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a)); + return half(_mm256_reduce_add_ph(a)); } template <> EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) { - return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a)); + return half(_mm_reduce_add_ph(a)); } // predux_half_dowto4 template <> EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0)); - __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1)); + const __m512i bits = _mm512_castph_si512(a); + Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits)); + Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1)); + return padd(lo, hi); +} - return Packet16h(padd<Packet16h>(lowHalf, highHalf)); -#else - Eigen::half data[32]; - _mm512_storeu_ph(data, a); - - __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data)); - __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16)); - - return Packet16h(padd<Packet16h>(lowHalf, highHalf)); -#endif +template <> +EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) { + Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a))); + Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1)); + return padd(lo, hi); } // predux_max +template <> +EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) { + return half(_mm512_reduce_max_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) { + return half(_mm256_reduce_max_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) { + return half(_mm_reduce_max_ph(a)); +} + // predux_min +template <> +EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) { + return half(_mm512_reduce_min_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half 
predux_min<Packet16h>(const Packet16h& a) { + return half(_mm256_reduce_min_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) { + return half(_mm_reduce_min_ph(a)); +} + // predux_mul +template <> +EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) { + return half(_mm512_reduce_mul_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) { + return half(_mm256_reduce_mul_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) { + return half(_mm_reduce_mul_ph(a)); +} + #ifdef EIGEN_VECTORIZE_FMA // pmadd @@ -449,12 +815,12 @@ template <> EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); + return _mm256_fmadd_ph(a, b, c); } template <> EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); + return _mm_fmadd_ph(a, b, c); } // pmsub @@ -466,12 +832,12 @@ template <> EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); + return _mm256_fmsub_ph(a, b, c); } template <> EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); + return _mm_fmsub_ph(a, b, c); } // pnmadd @@ -483,12 +849,12 @@ template <> EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); + return 
_mm256_fnmadd_ph(a, b, c); } template <> EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); + return _mm_fnmadd_ph(a, b, c); } // pnmsub @@ -500,12 +866,12 @@ template <> EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { - return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); + return _mm256_fnmsub_ph(a, b, c); } template <> EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { - return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); + return _mm_fnmsub_ph(a, b, c); } #endif @@ -514,35 +880,74 @@ template <> EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) { - return psub(pzero(a), a); + return _mm512_castsi512_ph( + _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u)))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) { + return _mm256_castsi256_ph( + _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u)))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) { + return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u)))); } // pconj -template <> -EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) { - return a; -} +// Nothing, packets are real. 
// psqrt template <> EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) { - return _mm512_sqrt_ph(a); + return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) { + return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) { + return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a)); } // prsqrt template <> EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) { - return _mm512_rsqrt_ph(a); + return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) { + return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) { + return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a)); } // preciprocal template <> EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) { - return _mm512_rcp_ph(a); + return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) { + return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) { + return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a)); } // ptranspose @@ -663,6 +1068,246 @@ a.packet[3] = _mm512_castsi512_ph(a3); } +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) { + __m256i a = _mm256_castph_si256(kernel.packet[0]); + __m256i b = _mm256_castph_si256(kernel.packet[1]); + __m256i c = _mm256_castph_si256(kernel.packet[2]); + __m256i d = 
_mm256_castph_si256(kernel.packet[3]); + __m256i e = _mm256_castph_si256(kernel.packet[4]); + __m256i f = _mm256_castph_si256(kernel.packet[5]); + __m256i g = _mm256_castph_si256(kernel.packet[6]); + __m256i h = _mm256_castph_si256(kernel.packet[7]); + __m256i i = _mm256_castph_si256(kernel.packet[8]); + __m256i j = _mm256_castph_si256(kernel.packet[9]); + __m256i k = _mm256_castph_si256(kernel.packet[10]); + __m256i l = _mm256_castph_si256(kernel.packet[11]); + __m256i m = _mm256_castph_si256(kernel.packet[12]); + __m256i n = _mm256_castph_si256(kernel.packet[13]); + __m256i o = _mm256_castph_si256(kernel.packet[14]); + __m256i p = _mm256_castph_si256(kernel.packet[15]); + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = 
_mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. 
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0] = _mm256_castsi256_ph(a_p_0); + kernel.packet[1] = _mm256_castsi256_ph(a_p_1); + kernel.packet[2] = _mm256_castsi256_ph(a_p_2); + kernel.packet[3] = _mm256_castsi256_ph(a_p_3); + kernel.packet[4] = _mm256_castsi256_ph(a_p_4); + kernel.packet[5] = _mm256_castsi256_ph(a_p_5); + kernel.packet[6] = _mm256_castsi256_ph(a_p_6); + kernel.packet[7] = _mm256_castsi256_ph(a_p_7); + kernel.packet[8] = _mm256_castsi256_ph(a_p_8); + kernel.packet[9] = _mm256_castsi256_ph(a_p_9); + kernel.packet[10] = _mm256_castsi256_ph(a_p_a); + kernel.packet[11] = _mm256_castsi256_ph(a_p_b); + kernel.packet[12] = _mm256_castsi256_ph(a_p_c); + kernel.packet[13] = _mm256_castsi256_ph(a_p_d); + kernel.packet[14] = _mm256_castsi256_ph(a_p_e); + kernel.packet[15] = 
_mm256_castsi256_ph(a_p_f); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore<half>(in[0], kernel.packet[0]); + pstore<half>(in[1], kernel.packet[1]); + pstore<half>(in[2], kernel.packet[2]); + pstore<half>(in[3], kernel.packet[3]); + pstore<half>(in[4], kernel.packet[4]); + pstore<half>(in[5], kernel.packet[5]); + pstore<half>(in[6], kernel.packet[6]); + pstore<half>(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2 * i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j + 8] = in[j][2 * i + 1]; + } + } + + kernel.packet[0] = pload<Packet16h>(out[0]); + kernel.packet[1] = pload<Packet16h>(out[1]); + kernel.packet[2] = pload<Packet16h>(out[2]); + kernel.packet[3] = pload<Packet16h>(out[3]); + kernel.packet[4] = pload<Packet16h>(out[4]); + kernel.packet[5] = pload<Packet16h>(out[5]); + kernel.packet[6] = pload<Packet16h>(out[6]); + kernel.packet[7] = pload<Packet16h>(out[7]); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore<half>(in[0], kernel.packet[0]); + pstore<half>(in[1], kernel.packet[1]); + pstore<half>(in[2], kernel.packet[2]); + pstore<half>(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4 * i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j + 4] = in[j][4 * i + 1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j + 8] = in[j][4 * i + 2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j + 12] = in[j][4 * i + 3]; + } + } + + kernel.packet[0] = pload<Packet16h>(out[0]); + kernel.packet[1] = pload<Packet16h>(out[1]); + kernel.packet[2] = pload<Packet16h>(out[2]); + kernel.packet[3] = pload<Packet16h>(out[3]); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) { + __m128i a = 
_mm_castph_si128(kernel.packet[0]); + __m128i b = _mm_castph_si128(kernel.packet[1]); + __m128i c = _mm_castph_si128(kernel.packet[2]); + __m128i d = _mm_castph_si128(kernel.packet[3]); + __m128i e = _mm_castph_si128(kernel.packet[4]); + __m128i f = _mm_castph_si128(kernel.packet[5]); + __m128i g = _mm_castph_si128(kernel.packet[6]); + __m128i h = _mm_castph_si128(kernel.packet[7]); + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0] = _mm_castsi128_ph(a0b0c0d0e0f0g0h0); + kernel.packet[1] = _mm_castsi128_ph(a1b1c1d1e1f1g1h1); + kernel.packet[2] = 
_mm_castsi128_ph(a2b2c2d2e2f2g2h2); + kernel.packet[3] = _mm_castsi128_ph(a3b3c3d3e3f3g3h3); + kernel.packet[4] = _mm_castsi128_ph(a4b4c4d4e4f4g4h4); + kernel.packet[5] = _mm_castsi128_ph(a5b5c5d5e5f5g5h5); + kernel.packet[6] = _mm_castsi128_ph(a6b6c6d6e6f6g6h6); + kernel.packet[7] = _mm_castsi128_ph(a7b7c7d7e7f7g7h7); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore<Eigen::half>(in[0], kernel.packet[0]); + pstore<Eigen::half>(in[1], kernel.packet[1]); + pstore<Eigen::half>(in[2], kernel.packet[2]); + pstore<Eigen::half>(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2 * i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j + 4] = in[j][2 * i + 1]; + } + } + + kernel.packet[0] = pload<Packet8h>(out[0]); + kernel.packet[1] = pload<Packet8h>(out[1]); + kernel.packet[2] = pload<Packet8h>(out[2]); + kernel.packet[3] = pload<Packet8h>(out[3]); +} + // preverse template <> @@ -672,6 +1317,20 @@ a); } +template <> +EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) { + __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + return _mm256_castsi256_ph(_mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) { + __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m)); +} + // pscatter template <> @@ -684,191 +1343,68 @@ to[stride * i] = aux[i]; } } +template <> +EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) { + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride * 0] = aux[0]; + to[stride * 
1] = aux[1]; + to[stride * 2] = aux[2]; + to[stride * 3] = aux[3]; + to[stride * 4] = aux[4]; + to[stride * 5] = aux[5]; + to[stride * 6] = aux[6]; + to[stride * 7] = aux[7]; + to[stride * 8] = aux[8]; + to[stride * 9] = aux[9]; + to[stride * 10] = aux[10]; + to[stride * 11] = aux[11]; + to[stride * 12] = aux[12]; + to[stride * 13] = aux[13]; + to[stride * 14] = aux[14]; + to[stride * 15] = aux[15]; +} + +template <> +EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) { + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride * 0] = aux[0]; + to[stride * 1] = aux[1]; + to[stride * 2] = aux[2]; + to[stride * 3] = aux[3]; + to[stride * 4] = aux[4]; + to[stride * 5] = aux[5]; + to[stride * 6] = aux[6]; + to[stride * 7] = aux[7]; +} // pgather template <> EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) { - return _mm512_castsi512_ph(_mm512_set_epi16( - from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x, - from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x, - from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, - from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x, - from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x, - from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x, - from[1 * stride].x, from[0 * stride].x)); + return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, + from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, + from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x, + from[19 * stride].x, from[18 * stride].x, from[17 * 
stride].x, from[16 * stride].x, + from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x, + from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, + from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, + from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x); } template <> -EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&); -template <> -EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&); - -EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) { - __m512d result = _mm512_undefined_pd(); - result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0); - result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1); - return _mm512_castpd_ph(result); +EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) { + return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x, + from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, + from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, + from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x); } -EIGEN_STRONG_INLINE void 
extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) { - a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0)); - b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1)); -} - -// psin template <> -EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = psin(low); - Packet16h highOut = psin(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pcos -template <> -EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = pcos(low); - Packet16h highOut = pcos(high); - - return combine2Packet16h(lowOut, highOut); -} - -// plog -template <> -EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = plog(low); - Packet16h highOut = plog(high); - - return combine2Packet16h(lowOut, highOut); -} - -// plog2 -template <> -EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = plog2(low); - Packet16h highOut = plog2(high); - - return combine2Packet16h(lowOut, highOut); -} - -// plog1p -template <> -EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = plog1p(low); - Packet16h highOut = plog1p(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pexp -template <> -EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = pexp(low); - Packet16h highOut = pexp(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pexpm1 -template <> -EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const 
Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = pexpm1(low); - Packet16h highOut = pexpm1(high); - - return combine2Packet16h(lowOut, highOut); -} - -// ptanh -template <> -EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h lowOut = ptanh(low); - Packet16h highOut = ptanh(high); - - return combine2Packet16h(lowOut, highOut); -} - -// pfrexp -template <> -EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h exp1 = _mm256_undefined_si256(); - Packet16h exp2 = _mm256_undefined_si256(); - - Packet16h lowOut = pfrexp(low, exp1); - Packet16h highOut = pfrexp(high, exp2); - - exponent = combine2Packet16h(exp1, exp2); - - return combine2Packet16h(lowOut, highOut); -} - -// pldexp -template <> -EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) { - Packet16h low; - Packet16h high; - extract2Packet16h(a, low, high); - - Packet16h exp1; - Packet16h exp2; - extract2Packet16h(exponent, exp1, exp2); - - Packet16h lowOut = pldexp(low, exp1); - Packet16h highOut = pldexp(high, exp2); - - return combine2Packet16h(lowOut, highOut); +EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) { + return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, + from[2 * stride].x, from[1 * stride].x, from[0 * stride].x); } } // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h index 9508ac6..fc55fd8 100644 --- a/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -237,17 +237,13 @@ return _mm512_castsi512_si128(a); } +#ifndef EIGEN_VECTORIZE_AVX512FP16 template <> EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) { return _mm256_castsi256_si128(a); } template <> -EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) { - return _mm256_castsi256_si128(a); -} - -template <> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) { return half2float(a); } @@ -257,6 +253,13 @@ return float2half(a); } +#endif + +template <> +EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) { + return _mm256_castsi256_si128(a); +} + template <> EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) { return Bf16ToF32(a); @@ -267,68 +270,6 @@ return F32ToBf16(a); } -#ifdef EIGEN_VECTORIZE_AVX512FP16 - -template <> -EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) { - return _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0)); -} -template <> -EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) { - return _mm256_castsi256_si128(preinterpret<Packet16h>(a)); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) { - // Discard second-half of input. 
- Packet16h low = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0)); - return _mm512_cvtxph_ps(_mm256_castsi256_ph(low)); -} - -template <> -EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) { - __m512d result = _mm512_undefined_pd(); - result = _mm512_insertf64x4( - result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0); - result = _mm512_insertf64x4( - result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1); - return _mm512_castpd_ph(result); -} - -template <> -EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) { - // Discard second-half of input. - Packet8h low = _mm_castps_si128(_mm256_extractf32x4_ps(_mm256_castsi256_ps(a), 0)); - return _mm256_cvtxph_ps(_mm_castsi128_ph(low)); -} - -template <> -EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) { - __m256d result = _mm256_undefined_pd(); - result = _mm256_insertf64x2(result, - _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0); - result = _mm256_insertf64x2(result, - _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1); - return _mm256_castpd_si256(result); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) { - Packet8f full = _mm256_cvtxph_ps(_mm_castsi128_ph(a)); - // Discard second-half of input. - return _mm256_extractf32x4_ps(full, 0); -} - -template <> -EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) { - __m256 result = _mm256_undefined_ps(); - result = _mm256_insertf128_ps(result, a, 0); - result = _mm256_insertf128_ps(result, b, 1); - return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT); -} - -#endif - } // end namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h new file mode 100644 index 0000000..f06f13d --- /dev/null +++ b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
@@ -0,0 +1,130 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 The Eigen Authors. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_FP16_AVX512_H +#define EIGEN_TYPE_CASTING_FP16_AVX512_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template <> +EIGEN_STRONG_INLINE Packet32s preinterpret<Packet32s, Packet32h>(const Packet32h& a) { + return _mm512_castph_si512(a); +} +template <> +EIGEN_STRONG_INLINE Packet16s preinterpret<Packet16s, Packet16h>(const Packet16h& a) { + return _mm256_castph_si256(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8h>(const Packet8h& a) { + return _mm_castph_si128(a); +} + +template <> +EIGEN_STRONG_INLINE Packet32h preinterpret<Packet32h, Packet32s>(const Packet32s& a) { + return _mm512_castsi512_ph(a); +} +template <> +EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet16s>(const Packet16s& a) { + return _mm256_castsi256_ph(a); +} +template <> +EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet8s>(const Packet8s& a) { + return _mm_castsi128_ph(a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) { + return half2float(a); +} +template <> +EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) { + return half2float(a); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) { + return float2half(a); +} +template <> +EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) { + return float2half(a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) { + // Discard second-half of input. 
+ Packet16h low = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0)); + return _mm512_cvtxph_ps(low); +} +template <> +EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) { + // Discard second-half of input. + Packet8h low = _mm_castps_ph(_mm256_extractf32x4_ps(_mm256_castph_ps(a), 0)); + return _mm256_cvtxph_ps(low); +} +template <> +EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) { + Packet8f full = _mm256_cvtxph_ps(a); + // Discard second-half of input. + return _mm256_extractf32x4_ps(full, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) { + __m512 result = _mm512_castsi512_ps(_mm512_castsi256_si512(_mm256_castph_si256(_mm512_cvtxps_ph(a)))); + result = _mm512_insertf32x8(result, _mm256_castph_ps(_mm512_cvtxps_ph(b)), 1); + return _mm512_castps_ph(result); +} +template <> +EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) { + __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castph_si128(_mm256_cvtxps_ph(a)))); + result = _mm256_insertf32x4(result, _mm_castph_ps(_mm256_cvtxps_ph(b)), 1); + return _mm256_castps_ph(result); +} +template <> +EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) { + __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castps_si128(a))); + result = _mm256_insertf128_ps(result, b, 1); + return _mm256_cvtxps_ph(result); +} + +template <> +EIGEN_STRONG_INLINE Packet32s pcast<Packet32h, Packet32s>(const Packet32h& a) { + return _mm512_cvtph_epi16(a); +} +template <> +EIGEN_STRONG_INLINE Packet16s pcast<Packet16h, Packet16s>(const Packet16h& a) { + return _mm256_cvtph_epi16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast<Packet8h, Packet8s>(const Packet8h& a) { + return _mm_cvtph_epi16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet32h pcast<Packet32s, Packet32h>(const Packet32s& a) { + 
return _mm512_cvtepi16_ph(a); +} +template <> +EIGEN_STRONG_INLINE Packet16h pcast<Packet16s, Packet16h>(const Packet16s& a) { + return _mm256_cvtepi16_ph(a); +} +template <> +EIGEN_STRONG_INLINE Packet8h pcast<Packet8s, Packet8h>(const Packet8s& a) { + return _mm_cvtepi16_ph(a); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TYPE_CASTING_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 482064e..d7bd9be 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -186,6 +186,7 @@ HasExp = 1, #ifdef EIGEN_VECTORIZE_VSX HasSqrt = 1, + HasCbrt = 1, #if !EIGEN_COMP_CLANG HasRsqrt = 1, #else @@ -424,55 +425,6 @@ masked_store_available = false }; }; -inline std::ostream& operator<<(std::ostream& s, const Packet16c& v) { - union { - Packet16c v; - signed char n[16]; - } vt; - vt.v = v; - for (int i = 0; i < 16; i++) s << vt.n[i] << ", "; - return s; -} - -inline std::ostream& operator<<(std::ostream& s, const Packet16uc& v) { - union { - Packet16uc v; - unsigned char n[16]; - } vt; - vt.v = v; - for (int i = 0; i < 16; i++) s << vt.n[i] << ", "; - return s; -} - -inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) { - union { - Packet4f v; - float n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} - -inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) { - union { - Packet4i v; - int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} - -inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) { - union { - Packet4ui v; - unsigned int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} template <typename Packet> EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) { @@ -2385,6 +2337,44 @@ } template <> +EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f b_even = Bf16ToF32Even(b); + Packet4f b_odd = Bf16ToF32Odd(b); + Packet4f c_even = Bf16ToF32Even(c); + Packet4f c_odd = Bf16ToF32Odd(c); + Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even); + Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd); + return F32ToBf16(pmadd_even, pmadd_odd); +} +template <> +EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, 
const Packet8bf& c) { + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f b_even = Bf16ToF32Even(b); + Packet4f b_odd = Bf16ToF32Odd(b); + Packet4f c_even = Bf16ToF32Even(c); + Packet4f c_odd = Bf16ToF32Odd(c); + Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even); + Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd); + return F32ToBf16(pmadd_even, pmadd_odd); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f b_even = Bf16ToF32Even(b); + Packet4f b_odd = Bf16ToF32Odd(b); + Packet4f c_even = Bf16ToF32Even(c); + Packet4f c_odd = Bf16ToF32Odd(c); + Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even); + Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd); + return F32ToBf16(pmadd_even, pmadd_odd); +} + +template <> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b); } @@ -3187,6 +3177,7 @@ HasLog = 0, HasExp = 1, HasSqrt = 1, + HasCbrt = 1, #if !EIGEN_COMP_CLANG HasRsqrt = 1, #else
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index b8d3b4f..f2e55f3 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -673,6 +673,11 @@ return bfloat16(::fmaxf(f1, f2)); } +EIGEN_DEVICE_FUNC inline bfloat16 fma(const bfloat16& a, const bfloat16& b, const bfloat16& c) { + // Emulate FMA via float. + return bfloat16(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c))); +} + #ifndef EIGEN_NO_IO EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) { os << static_cast<float>(v);
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 174eb57..e9f564b 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -187,7 +187,7 @@ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1, ExponentBits = TotalBits - MantissaBits - 1; - EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = + constexpr ScalarUI scalar_sign_mantissa_mask = ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000 const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask)); const Packet half = pset1<Packet>(Scalar(0.5)); @@ -196,7 +196,7 @@ // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). const Packet is_denormal = pcmp_lt(pabs(a), normal_min); - EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24 + constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24 // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor); @@ -289,6 +289,142 @@ return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e))); } +// This function implements a single step of Halley's iteration for +// computing x = y^(1/3): +// x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y) +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k, + const Packet& y) { + typedef typename unpacket_traits<Packet>::type Scalar; + Packet x_k_cb = pmul(x_k, pmul(x_k, x_k)); + Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y); + Packet num = psub(x_k_cb, y); + Packet r = pdiv(num, denom); + return pnmadd(x_k, r, x_k); +} + +// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the +// interval [0.125,1]. 
+template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) { + typedef typename unpacket_traits<Packet>::type Scalar; + // Extract the significant s in the range [0.5,1) and exponent e, such that + // x = 2^e * s. + Packet e, s; + s = pfrexp(x, e); + + // Split the exponent into a part divisible by 3 and the remainder. + // e = 3*e_div3 + e_mod3. + constexpr Scalar kOneThird = Scalar(1) / 3; + e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird))); + Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e); + + // Replace s by y = (s * 2^e_mod3). + return pldexp_fast(s, e_mod3); +} + +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x, + const Packet& abs_root) { + typedef typename unpacket_traits<Packet>::type Scalar; + + // Set sign. + const Packet sign_mask = pset1<Packet>(Scalar(-0.0)); + const Packet x_sign = pand(sign_mask, x); + Packet root = por(x_sign, abs_root); + + // Pass non-finite and zero values of x straight through. + const Packet is_not_finite = por(pisinf(x), pisnan(x)); + const Packet is_zero = pcmp_eq(pzero(x), x); + const Packet use_x = por(is_not_finite, is_zero); + return pselect(use_x, x, root); +} + +// Generic implementation of cbrt(x) for float. +// +// The algorithm computes the cubic root of the input by first +// decomposing it into a exponent and significant +// x = s * 2^e. +// +// We can then write the cube root as +// +// x^(1/3) = 2^(e/3) * s^(1/3) +// = 2^((3*e_div3 + e_mod3)/3) * s^(1/3) +// = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3) +// = 2^(e_div3) * (s * 2^e_mod3)^(1/3) +// +// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3. +// +// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely +// approximated using a cubic polynomial and subsequently refined using a +// single step of Halley's iteration, and finally the two terms are combined +// using pldexp_fast. 
+// +// Note: Many alternatives exist for implementing cbrt. See, for example, +// the excellent discussion in Kahan's note: +// https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf +// This particular implementation was found to be very fast and accurate +// among several alternatives tried, but is probably not "optimal" on all +// platforms. +// +// This is accurate to 2 ULP. +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) { + typedef typename unpacket_traits<Packet>::type Scalar; + static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float"); + + // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the + // interval [0.125,1]. + Packet e_div3; + const Packet y = cbrt_decompose(pabs(x), e_div3); + + // Compute initial approximation accurate to 5.22e-3. + // The polynomial was computed using Rminimax. + constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f, + 3.408401906490325927734375e-01f}; + Packet r = ppolevl<Packet, 3>::run(y, alpha); + + // Take one step of Halley's iteration. + r = cbrt_halley_iteration_step(r, y); + + // Finally multiply by 2^(e_div3) + r = pldexp_fast(r, e_div3); + + return cbrt_special_cases_and_sign(x, r); +} + +// Generic implementation of cbrt(x) for double. +// +// The algorithm is identical to the one for float except that a different initial +// approximation is used for y^(1/3) and two Halley iteration steps are peformed. +// +// This is accurate to 1 ULP. +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) { + typedef typename unpacket_traits<Packet>::type Scalar; + static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double"); + + // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the + // interval [0.125,1]. 
+ Packet e_div3; + const Packet y = cbrt_decompose(pabs(x), e_div3); + + // Compute initial approximation accurate to 0.016. + // The polynomial was computed using Rminimax. + constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01, + 1.072314636518546304699839311069808900356292724609375e+00, + 3.81249427609571867048288140722434036433696746826171875e-01}; + Packet r = ppolevl<Packet, 2>::run(y, alpha); + + // Take two steps of Halley's iteration. + r = cbrt_halley_iteration_step(r, y); + r = cbrt_halley_iteration_step(r, y); + + // Finally multiply by 2^(e_div3). + r = pldexp_fast(r, e_div3); + return cbrt_special_cases_and_sign(x, r); +} + // Natural or base 2 logarithm. // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can @@ -1123,7 +1259,7 @@ constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2); - const Packet cst_signmask = pset1<Packet>(-Scalar(0)); + const Packet cst_signmask = pset1<Packet>(Scalar(-0.0)); const Packet cst_one = pset1<Packet>(Scalar(1)); const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo); @@ -1685,7 +1821,7 @@ template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { typedef typename unpacket_traits<Packet>::type Scalar; - EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2; + constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2; const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr. const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x); Packet rho = psub(x, gamma); @@ -1991,7 +2127,7 @@ Packet m_x = pfrexp(x, e_x); // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x). 
- EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440); + constexpr Scalar sqrt_half = Scalar(0.70710678118654752440); const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half)); m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x); e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x); @@ -2074,7 +2210,7 @@ const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); const Packet y_is_odd_int = pandnot(y_is_int, y_is_even); // Smallest exponent for which (1 + epsilon) overflows to infinity. - EIGEN_CONSTEXPR Scalar huge_exponent = + constexpr Scalar huge_exponent = (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon(); const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index ac0e2cf..673954e 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -54,6 +54,14 @@ template <typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent); +/** \internal \returns cbrt(x) for single precision float */ +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x_in); + +/** \internal \returns cbrt(x) for double precision float */ +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x_in); + /** \internal \returns log(x) for single precision float */ template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x); @@ -195,6 +203,7 @@ EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET) \ EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET) \ EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET) \ + EIGEN_FLOAT_PACKET_FUNCTION(cbrt, PACKET) \ EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET) \ EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET) \ EIGEN_GENERIC_PACKET_FUNCTION(log1p, PACKET) \ @@ -208,6 +217,7 @@ EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \ EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET) \ EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET) \ + EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, PACKET) \ EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET) \ EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 95697f3..ba70d5f 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h
@@ -37,21 +37,23 @@ // IWYU pragma: private #include "../../InternalHeaderCheck.h" -#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) // When compiling with GPU support, the "__half_raw" base class as well as // some other routines are defined in the GPU compiler header files // (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr // As a consequence, we get compile failures when compiling Eigen with // GPU support. Hence the need to disable EIGEN_CONSTEXPR when building -// Eigen with GPU support -#pragma push_macro("EIGEN_CONSTEXPR") -#undef EIGEN_CONSTEXPR -#define EIGEN_CONSTEXPR +// Eigen with GPU support. +// Any functions that require `numext::bit_cast` may also not be constexpr, +// including any native types when setting via raw bit values. +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) +#define _EIGEN_MAYBE_CONSTEXPR +#else +#define _EIGEN_MAYBE_CONSTEXPR constexpr #endif #define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \ template <> \ - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \ + EIGEN_UNUSED EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \ return float2half(METHOD<PACKET_F>(half2float(_x))); \ } @@ -81,8 +83,10 @@ // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves // this error, and hence the following convoluted #if condition #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) + // Make our own __half_raw definition that is similar to CUDA's. 
struct __half_raw { + struct construct_from_rep_tag {}; #if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)) // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF) // The element type for shared memory cannot have non-trivial constructors @@ -91,43 +95,53 @@ // hence the need for this EIGEN_DEVICE_FUNC __half_raw() {} #else - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw() : x(0) {} #endif + #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) - explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {} + explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {} + EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, __fp16 rep) : x{rep} {} __fp16 x; +#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) + explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<_Float16>(raw)) {} + EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, _Float16 rep) : x{rep} {} + _Float16 x; #else - explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {} + explicit EIGEN_DEVICE_FUNC constexpr __half_raw(numext::uint16_t raw) : x(raw) {} + EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, numext::uint16_t rep) : x{rep} {} numext::uint16_t x; #endif }; #elif defined(EIGEN_HAS_HIP_FP16) -// Nothing to do here +// HIP GPU compile phase: nothing to do here. // HIP fp16 header file has a definition for __half_raw #elif defined(EIGEN_HAS_CUDA_FP16) + +// CUDA GPU compile phase. 
#if EIGEN_CUDA_SDK_VER < 90000 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw typedef __half __half_raw; #endif // defined(EIGEN_HAS_CUDA_FP16) + #elif defined(SYCL_DEVICE_ONLY) typedef cl::sycl::half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); struct half_base : public __half_raw { - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {} #if defined(EIGEN_HAS_GPU_FP16) #if defined(EIGEN_HAS_HIP_FP16) - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); } + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); } #elif defined(EIGEN_HAS_CUDA_FP16) #if EIGEN_CUDA_SDK_VER >= 90000 - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} #endif #endif #endif @@ -156,21 +170,29 @@ #endif #endif - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {} #if defined(EIGEN_HAS_GPU_FP16) #if defined(EIGEN_HAS_HIP_FP16) - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR 
half(const __half& h) : half_impl::half_base(h) {} #elif defined(EIGEN_HAS_CUDA_FP16) #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} #endif #endif #endif - explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b) +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b) + : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {} +#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) + explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(_Float16 b) + : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {} +#endif + + explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} template <class T> explicit EIGEN_DEVICE_FUNC half(T val) @@ -201,99 +223,99 @@ namespace half_impl { template <typename = void> struct numeric_limits_half_impl { - static EIGEN_CONSTEXPR const bool is_specialized = true; - static EIGEN_CONSTEXPR const bool is_signed = true; - static EIGEN_CONSTEXPR const bool is_integer = false; - static EIGEN_CONSTEXPR const bool is_exact = false; - static EIGEN_CONSTEXPR const bool has_infinity = true; - static EIGEN_CONSTEXPR const bool has_quiet_NaN = true; - static EIGEN_CONSTEXPR const bool has_signaling_NaN = true; + static constexpr const bool is_specialized = true; + static constexpr const bool is_signed = true; + static constexpr const bool is_integer = false; + static constexpr const bool is_exact = false; + static constexpr const bool has_infinity = true; + static constexpr const bool has_quiet_NaN = true; + static constexpr const bool has_signaling_NaN = true; EIGEN_DIAGNOSTICS(push) EIGEN_DISABLE_DEPRECATED_WARNING - static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present; - static 
EIGEN_CONSTEXPR const bool has_denorm_loss = false; + static constexpr const std::float_denorm_style has_denorm = std::denorm_present; + static constexpr const bool has_denorm_loss = false; EIGEN_DIAGNOSTICS(pop) - static EIGEN_CONSTEXPR const std::float_round_style round_style = std::round_to_nearest; - static EIGEN_CONSTEXPR const bool is_iec559 = true; + static constexpr const std::float_round_style round_style = std::round_to_nearest; + static constexpr const bool is_iec559 = true; // The C++ standard defines this as "true if the set of values representable // by the type is finite." Half has finite precision. - static EIGEN_CONSTEXPR const bool is_bounded = true; - static EIGEN_CONSTEXPR const bool is_modulo = false; - static EIGEN_CONSTEXPR const int digits = 11; - static EIGEN_CONSTEXPR const int digits10 = + static constexpr const bool is_bounded = true; + static constexpr const bool is_modulo = false; + static constexpr const int digits = 11; + static constexpr const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static EIGEN_CONSTEXPR const int max_digits10 = + static constexpr const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix; - static EIGEN_CONSTEXPR const int min_exponent = -13; - static EIGEN_CONSTEXPR const int min_exponent10 = -4; - static EIGEN_CONSTEXPR const int max_exponent = 16; - static EIGEN_CONSTEXPR const int max_exponent10 = 4; - static EIGEN_CONSTEXPR const bool traps = std::numeric_limits<float>::traps; + static constexpr const int radix = std::numeric_limits<float>::radix; + static constexpr const int min_exponent = -13; + static constexpr const int min_exponent10 = -4; + static constexpr const int max_exponent = 16; + static constexpr const int max_exponent10 = 4; + static constexpr const bool 
traps = std::numeric_limits<float>::traps; // IEEE754: "The implementer shall choose how tininess is detected, but shall // detect tininess in the same way for all operations in radix two" - static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before; + static constexpr const bool tinyness_before = std::numeric_limits<float>::tinyness_before; - static EIGEN_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); } - static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } - static EIGEN_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } - static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); } - static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); } - static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } - static EIGEN_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static EIGEN_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } - static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static 
_EIGEN_MAYBE_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } + static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); } }; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_specialized; +constexpr const bool numeric_limits_half_impl<T>::is_specialized; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_signed; +constexpr const bool numeric_limits_half_impl<T>::is_signed; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_integer; +constexpr const bool numeric_limits_half_impl<T>::is_integer; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_exact; +constexpr const bool numeric_limits_half_impl<T>::is_exact; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_infinity; +constexpr const bool numeric_limits_half_impl<T>::has_infinity; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_quiet_NaN; +constexpr const bool numeric_limits_half_impl<T>::has_quiet_NaN; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_signaling_NaN; +constexpr const bool numeric_limits_half_impl<T>::has_signaling_NaN; EIGEN_DIAGNOSTICS(push) EIGEN_DISABLE_DEPRECATED_WARNING template <typename T> -EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm; +constexpr const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_denorm_loss; +constexpr const bool numeric_limits_half_impl<T>::has_denorm_loss; EIGEN_DIAGNOSTICS(pop) template <typename T> -EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl<T>::round_style; +constexpr 
const std::float_round_style numeric_limits_half_impl<T>::round_style; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_iec559; +constexpr const bool numeric_limits_half_impl<T>::is_iec559; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_bounded; +constexpr const bool numeric_limits_half_impl<T>::is_bounded; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_modulo; +constexpr const bool numeric_limits_half_impl<T>::is_modulo; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits; +constexpr const int numeric_limits_half_impl<T>::digits; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits10; +constexpr const int numeric_limits_half_impl<T>::digits10; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_digits10; +constexpr const int numeric_limits_half_impl<T>::max_digits10; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::radix; +constexpr const int numeric_limits_half_impl<T>::radix; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent; +constexpr const int numeric_limits_half_impl<T>::min_exponent; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent10; +constexpr const int numeric_limits_half_impl<T>::min_exponent10; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent; +constexpr const int numeric_limits_half_impl<T>::max_exponent; template <typename T> -EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent10; +constexpr const int numeric_limits_half_impl<T>::max_exponent10; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::traps; +constexpr const bool numeric_limits_half_impl<T>::traps; template <typename T> -EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::tinyness_before; 
+constexpr const bool numeric_limits_half_impl<T>::tinyness_before; } // end namespace half_impl } // end namespace Eigen @@ -320,8 +342,7 @@ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) // Note: We deliberately do *not* define this to 1 even if we have Arm's native // fp16 type since GPU half types are rather different from native CPU half types. -// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16 -#define EIGEN_HAS_NATIVE_FP16 +#define EIGEN_HAS_NATIVE_GPU_FP16 #endif // Intrinsics for native fp16 support. Note that on current hardware, @@ -329,7 +350,7 @@ // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. -#if defined(EIGEN_HAS_NATIVE_FP16) +#if defined(EIGEN_HAS_NATIVE_GPU_FP16) EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 return __hadd(::__half(a), ::__half(b)); @@ -371,7 +392,8 @@ EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); } EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); } EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); } -#endif + +#endif // EIGEN_HAS_NATIVE_GPU_FP16 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); } @@ -401,16 +423,47 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); } + +#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) && !defined(EIGEN_GPU_COMPILE_PHASE) + 
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(a.x + b.x); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(a.x * b.x); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(a.x - b.x); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(a.x / b.x); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(-a.x); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return a.x == b.x; } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return a.x != b.x; } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return a.x < b.x; } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return a.x <= b.x; } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return a.x > b.x; } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return a.x >= b.x; } + // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. 
-#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats +#elif !defined(EIGEN_HAS_NATIVE_GPU_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC) // We need to provide emulated *host-side* FP16 operators for clang. #pragma push_macro("EIGEN_DEVICE_FUNC") #undef EIGEN_DEVICE_FUNC -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16) #define EIGEN_DEVICE_FUNC __host__ #else // both host and device need emulated ops. #define EIGEN_DEVICE_FUNC __host__ __device__ @@ -458,6 +511,7 @@ #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC) #pragma pop_macro("EIGEN_DEVICE_FUNC") #endif + #endif // Emulate support for half floats // Division by an index. Do it in full float precision to avoid accuracy @@ -493,7 +547,7 @@ // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) { // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type // in the hip_fp16 header file, and that will trigger a compile error // On the other hand, having anything but a return statement also triggers a compile error @@ -515,6 +569,8 @@ // For SYCL, cl::sycl::half is _Float16, so cast directly. 
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return numext::bit_cast<numext::uint16_t>(h.x); +#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) + return numext::bit_cast<numext::uint16_t>(h.x); #elif defined(SYCL_DEVICE_ONLY) return numext::bit_cast<numext::uint16_t>(h); #else @@ -528,6 +584,16 @@ __half tmp_ff = __float2half(ff); return *(__half_raw*)&tmp_ff; +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + __half_raw h; + h.x = static_cast<__fp16>(ff); + return h; + +#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) + __half_raw h; + h.x = static_cast<_Float16>(ff); + return h; + #elif defined(EIGEN_HAS_FP16_C) __half_raw h; #if EIGEN_COMP_MSVC @@ -538,11 +604,6 @@ #endif return h; -#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) - __half_raw h; - h.x = static_cast<__fp16>(ff); - return h; - #else uint32_t f_bits = Eigen::numext::bit_cast<uint32_t>(ff); const uint32_t f32infty_bits = {255 << 23}; @@ -595,6 +656,8 @@ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) + return static_cast<float>(h.x); #elif defined(EIGEN_HAS_FP16_C) #if EIGEN_COMP_MSVC // MSVC does not have scalar instructions. 
@@ -602,8 +665,6 @@ #else return _cvtsh_ss(h.x); #endif -#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) - return static_cast<float>(h.x); #else const float magic = Eigen::numext::bit_cast<float>(static_cast<uint32_t>(113 << 23)); const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift @@ -628,7 +689,7 @@ // --- standard functions --- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) { -#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00; #else return (a.x & 0x7fff) == 0x7c00; @@ -638,7 +699,7 @@ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hisnan(a); -#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00; #else return (a.x & 0x7fff) > 0x7c00; @@ -651,6 +712,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return half(vabsh_f16(a.x)); +#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) + half result; + result.x = + numext::bit_cast<_Float16>(static_cast<numext::uint16_t>(numext::bit_cast<numext::uint16_t>(a.x) & 0x7FFF)); + return result; #else half result; result.x = a.x & 0x7FFF; @@ -734,24 +800,19 @@ return half(::fmodf(float(a), float(b))); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __hlt(b, a) ? b : a; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { return b < a ? 
b : a; } + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { return a < b ? b : a; } + +EIGEN_DEVICE_FUNC inline half fma(const half& a, const half& b, const half& c) { +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return half(vfmah_f16(c.x, a.x, b.x)); +#elif defined(EIGEN_VECTORIZE_AVX512FP16) + // Reduces to vfmadd213sh. + return half(_mm_cvtsh_h(_mm_fmadd_ph(_mm_set_sh(a.x), _mm_set_sh(b.x), _mm_set_sh(c.x)))); #else - const float f1 = static_cast<float>(a); - const float f2 = static_cast<float>(b); - return f2 < f1 ? b : a; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __hlt(a, b) ? b : a; -#else - const float f1 = static_cast<float>(a); - const float f2 = static_cast<float>(b); - return f1 < f2 ? b : a; + // Emulate FMA via float. 
+ return half(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c))); #endif } @@ -794,31 +855,29 @@ struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> { enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false }; - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() { + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() { return half_impl::raw_uint16_to_half(0x0800); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() { + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() { return half_impl::raw_uint16_to_half(0x7bff); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() { + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() { return half_impl::raw_uint16_to_half(0xfbff); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() { + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() { return half_impl::raw_uint16_to_half(0x7c00); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { return half_impl::raw_uint16_to_half(0x7e00); } }; } // end namespace Eigen -#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) -#pragma pop_macro("EIGEN_CONSTEXPR") -#endif +#undef _EIGEN_MAYBE_CONSTEXPR namespace Eigen { namespace numext {
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h index 6bea9ac..402d92f 100644 --- a/Eigen/src/Core/arch/GPU/Tuple.h +++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -34,7 +34,7 @@ template <typename U1 = T1, typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value && reduce_all<std::is_default_constructible<Ts>::value...>::value>> - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {} + constexpr EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {} // Element constructor. template <typename U1, typename... Us, @@ -44,7 +44,7 @@ sizeof...(Us) == sizeof...(Ts) && ( // this does not look like a copy/move constructor. N > 1 || std::is_convertible<U1, T1>::value)>> - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args) + constexpr EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args) : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {} // The first stored value. @@ -102,11 +102,11 @@ using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>; using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType; - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) { + static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) { return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail()); } - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) { + static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) { return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail()); } }; @@ -117,11 +117,9 @@ using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>; using ReturnType = T1; - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); } + static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); } - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) { - return tuple.head(); - } + static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const 
TupleType& tuple) { return tuple.head(); } }; // Concatenates N Tuples. @@ -139,11 +137,9 @@ // Uses the index sequences to extract and merge elements from tuple1 and tuple2, // then recursively calls again. template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples> - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, - std::index_sequence<I1s...>, - Tuple2&& tuple2, - std::index_sequence<I2s...>, - MoreTuples&&... tuples) { + static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>, + Tuple2&& tuple2, std::index_sequence<I2s...>, + MoreTuples&&... tuples) { return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run( MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))..., tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...), @@ -152,8 +148,8 @@ // Concatenates the first two tuples. template <typename Tuple1, typename Tuple2, typename... MoreTuples> - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2, - MoreTuples&&... tuples) { + static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2, + MoreTuples&&... 
tuples) { return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2), std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...); } @@ -165,7 +161,7 @@ using ReturnType = TupleImpl<N, Args...>; template <typename Tuple1> - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) { + static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) { return tuple1; } }; @@ -174,7 +170,7 @@ template <> struct tuple_cat_impl<0> { using ReturnType = TupleImpl<0>; - static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; } + static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; } }; // For use in make_tuple, unwraps a reference_wrapper. @@ -211,13 +207,13 @@ * \return a reference to the desired element. */ template <size_t Idx, typename... Types> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get( +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get( const TupleImpl<sizeof...(Types), Types...>& tuple) { return tuple_get_impl<Idx, Types...>::run(tuple); } template <size_t Idx, typename... Types> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get( +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get( TupleImpl<sizeof...(Types), Types...>& tuple) { return tuple_get_impl<Idx, Types...>::run(tuple); } @@ -229,7 +225,7 @@ */ template <typename... 
Tuples, typename EnableIf = std::enable_if_t< internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType tuple_cat(Tuples&&... tuples) { return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...); @@ -239,7 +235,7 @@ * Tie arguments together into a tuple. */ template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) EIGEN_NOEXCEPT { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) noexcept { return ReturnType{args...}; } @@ -247,7 +243,7 @@ * Create a tuple of l-values with the supplied arguments. */ template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) { return ReturnType{std::forward<Args>(args)...}; } @@ -255,8 +251,7 @@ * Forward a set of arguments as a tuple. */ template <typename... Args> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple( - Args&&... args) { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) { return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...); }
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 3f2d9d5..6d7f038 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -207,6 +207,7 @@ HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, @@ -4965,6 +4966,26 @@ } template <> +EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) { + return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) { + return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) { + return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) { + return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) { return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b))); } @@ -5140,6 +5161,7 @@ HasCos = EIGEN_FAST_MATH, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH @@ -5635,6 +5657,21 @@ } template <> +EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) { + return vfmaq_f16(pnegate(c), a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) { + return vfma_f16(c, pnegate(a), b); +} + +template <> +EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) { + return vfma_f16(pnegate(c), pnegate(a), b); +} + +template <> EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminq_f16(a, b); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index c69e3d4..f79da7b 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -465,19 +465,11 @@ } template <> EIGEN_STRONG_INLINE Packet2cf pnmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) { - __m128 a_odd = _mm_movehdup_ps(a.v); - __m128 a_even = _mm_moveldup_ps(a.v); - __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)); - __m128 result = _mm_fmaddsub_ps(a_odd, b_swap, _mm_fmaddsub_ps(a_even, b.v, c.v)); - return Packet2cf(result); + return pnegate(pmsub(a, b, c)); } template <> EIGEN_STRONG_INLINE Packet2cf pnmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) { - __m128 a_odd = _mm_movehdup_ps(a.v); - __m128 a_even = _mm_moveldup_ps(a.v); - __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)); - __m128 result = _mm_fmaddsub_ps(a_odd, b_swap, _mm_fmsubadd_ps(a_even, b.v, c.v)); - return Packet2cf(result); + return pnegate(pmadd(a, b, c)); } // std::complex<double> template <> @@ -498,19 +490,11 @@ } template <> EIGEN_STRONG_INLINE Packet1cd pnmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) { - __m128d a_odd = _mm_permute_pd(a.v, 0x3); - __m128d a_even = _mm_movedup_pd(a.v); - __m128d b_swap = _mm_permute_pd(b.v, 0x1); - __m128d result = _mm_fmaddsub_pd(a_odd, b_swap, _mm_fmaddsub_pd(a_even, b.v, c.v)); - return Packet1cd(result); + return pnegate(pmsub(a, b, c)); } template <> EIGEN_STRONG_INLINE Packet1cd pnmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) { - __m128d a_odd = _mm_permute_pd(a.v, 0x3); - __m128d a_even = _mm_movedup_pd(a.v); - __m128d b_swap = _mm_permute_pd(b.v, 0x1); - __m128d result = _mm_fmaddsub_pd(a_odd, b_swap, _mm_fmsubadd_pd(a_even, b.v, c.v)); - return Packet1cd(result); + return pnegate(pmadd(a, b, c)); } #endif } // end namespace internal
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 5e91fba..70d13d6 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -195,6 +195,7 @@ HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, @@ -222,6 +223,7 @@ HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasCbrt = 1, HasATan = 1, HasATanh = 1, HasBlend = 1
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 7edcc60..0239262 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -29,6 +29,11 @@ EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { pstoret<DstScalar, Packet, Alignment>(a, b); } + + template <int Alignment, typename Packet> + EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const { + pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count); + } }; // Empty overload for void type (used by PermutationMatrix) @@ -60,6 +65,12 @@ assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>( a, Func().packetOp(ploadt<Packet, Alignment>(a), b)); } + + template <int Alignment, typename Packet> + EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const { + assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>( + a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count); + } }; template <typename DstScalar, typename SrcScalar, typename Func>
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index c91e6bb..a93b998 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -438,7 +438,6 @@ } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { - maybe_raise_div_by_zero<Packet>::run(b); return internal::pdiv(a, b); } };
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index 14b56d7..35dc738 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -28,7 +28,7 @@ const Scalar m_other; }; template <typename Scalar> -struct functor_traits<scalar_constant_op<Scalar> > { +struct functor_traits<scalar_constant_op<Scalar>> { enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */, PacketAccess = packet_traits<Scalar>::Vectorizable, @@ -56,7 +56,7 @@ } }; template <typename Scalar> -struct functor_traits<scalar_identity_op<Scalar> > { +struct functor_traits<scalar_identity_op<Scalar>> { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; }; @@ -86,18 +86,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) + Packet low = pset1<Packet>(m_low); + Packet high = pset1<Packet>(m_high); + Packet step = pset1<Packet>(m_step); if (m_flip) { Packet pi = plset<Packet>(Scalar(i - m_size1)); - Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != 0)) return res; - Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0)); - return pselect<Packet>(mask, res, pset1<Packet>(m_low)); + Packet res = pmadd(step, pi, high); + Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i))); + return pselect<Packet>(mask, res, low); } else { Packet pi = plset<Packet>(Scalar(i)); - Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits<Packet>::size + 1)) return res; - Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size - 1)); - return pselect<Packet>(mask, res, pset1<Packet>(m_high)); + Packet res = pmadd(step, pi, low); + Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1))); + return pselect<Packet>(mask, res, high); } } @@ -139,7 +140,7 @@ template <typename Scalar> struct linspaced_op; template <typename Scalar> -struct functor_traits<linspaced_op<Scalar> > { 
+struct functor_traits<linspaced_op<Scalar>> { enum { Cost = 1, PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear, @@ -192,7 +193,7 @@ }; template <typename Scalar> -struct functor_traits<equalspaced_op<Scalar> > { +struct functor_traits<equalspaced_op<Scalar>> { enum { Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost, PacketAccess =
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 03542e3..ba7d97a 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -558,11 +558,15 @@ template <typename Scalar> struct scalar_cbrt_op { EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); } + template <typename Packet> + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { + return internal::pcbrt(a); + } }; template <typename Scalar> struct functor_traits<scalar_cbrt_op<Scalar>> { - enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; + enum { Cost = 20 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCbrt }; }; /** \internal
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 8f7b7dd..4f36689 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h
@@ -217,7 +217,7 @@ // Note that the actual number of threads might be lower than the number of // requested ones Index actual_threads = omp_get_num_threads(); - GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info); + GemmParallelInfo<Index> info(static_cast<int>(i), static_cast<int>(actual_threads), task_info); Index blockCols = (cols / actual_threads) & ~Index(0x3); Index blockRows = (rows / actual_threads);
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 5d3f1cf..49f307c 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -285,6 +285,8 @@ #ifdef __AVX512FP16__ #ifdef __AVX512VL__ #define EIGEN_VECTORIZE_AVX512FP16 +// Built-in _Float16. +#define EIGEN_HAS_BUILTIN_FLOAT16 1 #else #if EIGEN_COMP_GNUC #error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
diff --git a/Eigen/src/Core/util/EmulateArray.h b/Eigen/src/Core/util/EmulateArray.h index f2fd10b..6c4c22d 100644 --- a/Eigen/src/Core/util/EmulateArray.h +++ b/Eigen/src/Core/util/EmulateArray.h
@@ -248,15 +248,15 @@ #endif template <std::size_t I_, class T, std::size_t N> -constexpr inline T& array_get(std::array<T, N>& a) { +constexpr T& array_get(std::array<T, N>& a) { return (T&)STD_GET_ARR_HACK; } template <std::size_t I_, class T, std::size_t N> -constexpr inline T&& array_get(std::array<T, N>&& a) { +constexpr T&& array_get(std::array<T, N>&& a) { return (T&&)STD_GET_ARR_HACK; } template <std::size_t I_, class T, std::size_t N> -constexpr inline T const& array_get(std::array<T, N> const& a) { +constexpr T const& array_get(std::array<T, N> const& a) { return (T const&)STD_GET_ARR_HACK; }
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 2488be4..3c0bc46 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -514,6 +514,9 @@ struct eigen_memset_helper; template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value> struct eigen_zero_impl; + +template <typename Packet> +struct has_packet_segment : std::false_type {}; } // namespace internal } // end namespace Eigen
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 7347dbb..aed8a88 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h
@@ -1281,11 +1281,6 @@ #define EIGEN_CATCH(X) else #endif -#define EIGEN_NOEXCEPT noexcept -#define EIGEN_NOEXCEPT_IF(x) noexcept(x) -#define EIGEN_NO_THROW noexcept(true) -#define EIGEN_EXCEPTION_SPEC(X) noexcept(false) - // The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. namespace Eigen { namespace internal {
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 0facd26..89b2fff 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h
@@ -833,46 +833,44 @@ // HIP does not support new/delete on device. #if EIGEN_MAX_ALIGN_BYTES != 0 && !defined(EIGEN_HIP_DEVICE_COMPILE) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ - EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ - EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \ - EIGEN_CATCH(...) { return 0; } \ +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) noexcept { \ + EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \ + EIGEN_CATCH(...) { return 0; } \ } -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ - EIGEN_DEVICE_FUNC void* operator new(std::size_t size) { \ - return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \ - } \ - EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) { \ - return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \ - } \ - EIGEN_DEVICE_FUNC void operator delete(void* ptr) EIGEN_NO_THROW { \ - Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ - } \ - EIGEN_DEVICE_FUNC void operator delete[](void* ptr) EIGEN_NO_THROW { \ - Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ - } \ - EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) EIGEN_NO_THROW { \ - Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ - } \ - EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) EIGEN_NO_THROW { \ - Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ - } \ - /* in-place new and delete. since (at least afaik) there is no actual */ \ - /* memory allocated we can safely let the default implementation handle */ \ - /* this particular case. 
*/ \ - EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); } \ - EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); } \ - EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) EIGEN_NO_THROW { \ - return ::operator delete(memory, ptr); \ - } \ - EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) EIGEN_NO_THROW { \ - return ::operator delete[](memory, ptr); \ - } \ - /* nothrow-new (returns zero instead of std::bad_alloc) */ \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ - EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) EIGEN_NO_THROW { \ - Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ - } \ +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ + EIGEN_DEVICE_FUNC void* operator new(std::size_t size) { \ + return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \ + } \ + EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) { \ + return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \ + } \ + EIGEN_DEVICE_FUNC void operator delete(void* ptr) noexcept { \ + Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ + } \ + EIGEN_DEVICE_FUNC void operator delete[](void* ptr) noexcept { \ + Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ + } \ + EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) noexcept { \ + Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ + } \ + EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) noexcept { \ + Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ + } \ + /* in-place new and delete. since (at least afaik) there is no actual */ \ + /* memory allocated we can safely let the default implementation handle */ \ + /* this particular case. 
*/ \ + EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); } \ + EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); } \ + EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) noexcept { return ::operator delete(memory, ptr); } \ + EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) noexcept { \ + return ::operator delete[](memory, ptr); \ + } \ + /* nothrow-new (returns zero instead of std::bad_alloc) */ \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) noexcept { \ + Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \ + } \ typedef void eigen_aligned_operator_new_marker_type; #else #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 40604f8..ddbc898 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h
@@ -90,12 +90,8 @@ * we however don't want to add a dependency to Boost. */ -struct true_type { - enum { value = 1 }; -}; -struct false_type { - enum { value = 0 }; -}; +using std::false_type; +using std::true_type; template <bool Condition> struct bool_constant; @@ -341,7 +337,7 @@ #if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L template <typename T> -EIGEN_CONSTEXPR auto index_list_size(T&& x) { +constexpr auto index_list_size(T&& x) { using std::ssize; return ssize(std::forward<T>(x)); } @@ -349,13 +345,13 @@ #else template <typename T> -EIGEN_CONSTEXPR auto index_list_size(const T& x) { +constexpr auto index_list_size(const T& x) { using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>; return static_cast<R>(x.size()); } template <typename T, std::ptrdiff_t N> -EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { +constexpr std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } #endif @@ -641,21 +637,21 @@ constexpr bool is_int_or_enum_v = std::is_enum<A>::value || std::is_integral<A>::value; template <typename A, typename B> -inline constexpr void plain_enum_asserts(A, B) { +constexpr void plain_enum_asserts(A, B) { static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum"); static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum"); } /// \internal Gets the minimum of two values which may be integers or enums template <typename A, typename B> -inline constexpr int plain_enum_min(A a, B b) { +constexpr int plain_enum_min(A a, B b) { plain_enum_asserts(a, b); return ((int)a <= (int)b) ? (int)a : (int)b; } /// \internal Gets the maximum of two values which may be integers or enums template <typename A, typename B> -inline constexpr int plain_enum_max(A a, B b) { +constexpr int plain_enum_max(A a, B b) { plain_enum_asserts(a, b); return ((int)a >= (int)b) ? 
(int)a : (int)b; } @@ -667,7 +663,7 @@ * finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3. */ template <typename A, typename B> -inline constexpr int min_size_prefer_dynamic(A a, B b) { +constexpr int min_size_prefer_dynamic(A a, B b) { plain_enum_asserts(a, b); if ((int)a == 0 || (int)b == 0) return 0; if ((int)a == 1 || (int)b == 1) return 1; @@ -682,7 +678,7 @@ * 0 and 3), it is not more than 3. */ template <typename A, typename B> -inline constexpr int min_size_prefer_fixed(A a, B b) { +constexpr int min_size_prefer_fixed(A a, B b) { plain_enum_asserts(a, b); if ((int)a == 0 || (int)b == 0) return 0; if ((int)a == 1 || (int)b == 1) return 1; @@ -694,7 +690,7 @@ /// \internal see `min_size_prefer_fixed`. No need for a separate variant for MaxSizes here. template <typename A, typename B> -inline constexpr int max_size_prefer_dynamic(A a, B b) { +constexpr int max_size_prefer_dynamic(A a, B b) { plain_enum_asserts(a, b); if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic; return plain_enum_max(a, b); @@ -714,38 +710,38 @@ } template <typename A, typename B> -inline constexpr bool enum_lt_not_dynamic(A a, B b) { +constexpr bool enum_lt_not_dynamic(A a, B b) { plain_enum_asserts(a, b); if ((int)a == Dynamic || (int)b == Dynamic) return false; return (int)a < (int)b; } template <typename A, typename B> -inline constexpr bool enum_le_not_dynamic(A a, B b) { +constexpr bool enum_le_not_dynamic(A a, B b) { plain_enum_asserts(a, b); if ((int)a == Dynamic || (int)b == Dynamic) return false; return (int)a <= (int)b; } template <typename A, typename B> -inline constexpr bool enum_gt_not_dynamic(A a, B b) { +constexpr bool enum_gt_not_dynamic(A a, B b) { plain_enum_asserts(a, b); if ((int)a == Dynamic || (int)b == Dynamic) return false; return (int)a > (int)b; } template <typename A, typename B> -inline constexpr bool enum_ge_not_dynamic(A a, B b) { +constexpr bool enum_ge_not_dynamic(A a, B b) { 
plain_enum_asserts(a, b); if ((int)a == Dynamic || (int)b == Dynamic) return false; return (int)a >= (int)b; } /// \internal Calculate logical XOR at compile time -inline constexpr bool logical_xor(bool a, bool b) { return a != b; } +constexpr bool logical_xor(bool a, bool b) { return a != b; } /// \internal Calculate logical IMPLIES at compile time -inline constexpr bool check_implication(bool a, bool b) { return !a || b; } +constexpr bool check_implication(bool a, bool b) { return !a || b; } /// \internal Provide fallback for std::is_constant_evaluated for pre-C++20. #if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
diff --git a/Eigen/src/Core/util/ReshapedHelper.h b/Eigen/src/Core/util/ReshapedHelper.h index e569408..1747950 100644 --- a/Eigen/src/Core/util/ReshapedHelper.h +++ b/Eigen/src/Core/util/ReshapedHelper.h
@@ -40,7 +40,7 @@ inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { return total / other; } -constexpr inline int get_compiletime_reshape_order(int flags, int order) { +constexpr int get_compiletime_reshape_order(int flags, int order) { return order == AutoOrder ? flags & RowMajorBit : order; }
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a42bb0f..a0e160e 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h
@@ -158,8 +158,8 @@ EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR operator T() const { return T(Value); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr operator T() const { return T(Value); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); @@ -171,7 +171,7 @@ T m_value; public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } @@ -186,7 +186,7 @@ EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} }; @@ -315,7 +315,7 @@ }; #if EIGEN_MAX_STATIC_ALIGN_BYTES > 0 -constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) { +constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) { if ((ArrayBytes % AlignmentBytes) == 0) { return AlignmentBytes; } else if (EIGEN_MIN_ALIGN_BYTES < AlignmentBytes) { @@ -327,7 +327,7 @@ #else // If static alignment is disabled, no need to bother. 
// This also avoids a division by zero -constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) { +constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) { EIGEN_UNUSED_VARIABLE(ArrayBytes); EIGEN_UNUSED_VARIABLE(AlignmentBytes); return 0; @@ -362,7 +362,7 @@ typedef Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_> type; }; -constexpr inline unsigned compute_matrix_flags(int Options) { +constexpr unsigned compute_matrix_flags(int Options) { unsigned row_major_bit = Options & RowMajor ? RowMajorBit : 0; // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<> // and then propagate this information to the evaluator's flags. @@ -370,7 +370,7 @@ return DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit; } -constexpr inline int size_at_compile_time(int rows, int cols) { +constexpr int size_at_compile_time(int rows, int cols) { if (rows == 0 || cols == 0) return 0; if (rows == Dynamic || cols == Dynamic) return Dynamic; return rows * cols;
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index 5915387..a54d82d 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -449,23 +449,23 @@ Index lr = (std::min)(k + 4, dim); // last row to update Map<Matrix<Scalar, Dynamic, 1> > tmp(m_workspace.data(), lr); // S - tmp = m_S.template middleCols<2>(k).topRows(lr) * essential2; + tmp.noalias() = m_S.template middleCols<2>(k).topRows(lr) * essential2; tmp += m_S.col(k + 2).head(lr); m_S.col(k + 2).head(lr) -= tau * tmp; - m_S.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint(); + m_S.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint(); // T tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2; tmp += m_T.col(k + 2).head(lr); m_T.col(k + 2).head(lr) -= tau * tmp; - m_T.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint(); + m_T.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint(); } if (m_computeQZ) { // Z Map<Matrix<Scalar, 1, Dynamic> > tmp(m_workspace.data(), dim); - tmp = essential2.adjoint() * (m_Z.template middleRows<2>(k)); + tmp.noalias() = essential2.adjoint() * (m_Z.template middleRows<2>(k)); tmp += m_Z.row(k + 2); m_Z.row(k + 2) -= tau * tmp; - m_Z.template middleRows<2>(k) -= essential2 * (tau * tmp); + m_Z.template middleRows<2>(k).noalias() -= essential2 * (tau * tmp); } m_T.coeffRef(k + 2, k) = m_T.coeffRef(k + 2, k + 1) = Scalar(0.0);
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index b2f07bc..4da6d07 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -515,8 +515,8 @@ result.template diagonal<-1>() = m_matrix.template diagonal<-1>(); } - EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } - EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + constexpr Index rows() const noexcept { return m_matrix.rows(); } + constexpr Index cols() const noexcept { return m_matrix.cols(); } protected: typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h index 64c1b65..795af0d 100644 --- a/Eigen/src/Geometry/Homogeneous.h +++ b/Eigen/src/Geometry/Homogeneous.h
@@ -69,10 +69,10 @@ EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix) : m_matrix(matrix) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows() + (int(Direction) == Vertical ? 1 : 0); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols() + (int(Direction) == Horizontal ? 1 : 0); } @@ -244,8 +244,8 @@ EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs) : m_lhs(take_matrix_for_product<Lhs>::run(lhs)), m_rhs(rhs) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); } template <typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const { @@ -275,8 +275,8 @@ typedef remove_all_t<typename Rhs::Nested> RhsNested; EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); } template <typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 1d8ded9..147e6e3 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h
@@ -57,22 +57,22 @@ typedef AngleAxis<Scalar> AngleAxisType; /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC constexpr CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC constexpr CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC constexpr CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } + EIGEN_DEVICE_FUNC constexpr CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } + EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } + EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } + EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } /** \returns a reference to 
the \c w coefficient (if Derived is a non-const lvalue) */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } + EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } /** \returns a read-only vector expression of the imaginary part (x,y,z) */ EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients, 3> vec() const { return coeffs().template head<3>(); } @@ -346,13 +346,11 @@ // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. /** Default move constructor */ - EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) - EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value) + EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) noexcept(std::is_nothrow_move_constructible<Scalar>::value) : m_coeffs(std::move(other.coeffs())) {} /** Default move assignment operator */ - EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) - EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) { + EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) { m_coeffs = std::move(other.coeffs()); return *this; } @@ -793,7 +791,7 @@ } else { // theta is the angle between the 2 quaternions Scalar theta = acos(absD); - Scalar sinTheta = sin(theta); + Scalar sinTheta = numext::sqrt(Scalar(1) - absD * absD); scale0 = sin((Scalar(1) - t) * theta) / sinTheta; scale1 = sin((t * theta)) / sinTheta;
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index b1a9f21..a5d7b60 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h
@@ -353,10 +353,10 @@ inline QTransform toQTransform(void) const; #endif - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return int(Mode) == int(Projective) ? m_matrix.cols() : (m_matrix.cols() - 1); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); } /** shortcut for m_matrix(row,col); * \sa MatrixBase::operator(Index,Index) const */ @@ -1059,11 +1059,11 @@ : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); sv.coeffRef(Dim - 1) *= x; - if (scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint(); + if (scaling) (*scaling).noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint(); if (rotation) { LinearMatrixType m(svd.matrixU()); m.col(Dim - 1) *= x; - *rotation = m * svd.matrixV().adjoint(); + (*rotation).noalias() = m * svd.matrixV().adjoint(); } } @@ -1182,7 +1182,8 @@ eigen_assert(false && "Invalid transform traits in Transform::Inverse"); } // translation and remaining parts - res.matrix().template topRightCorner<Dim, 1>() = -res.matrix().template topLeftCorner<Dim, Dim>() * translation(); + res.matrix().template topRightCorner<Dim, 1>().noalias() = + -res.matrix().template topLeftCorner<Dim, Dim>() * translation(); res.makeAffine(); // we do need this, because in the beginning res is uninitialized } return res; @@ -1432,7 +1433,7 @@ typedef Transform<Scalar, Dim, ResultMode, LhsOptions> ResultType; static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) { ResultType res; - res.linear() = lhs.linear() * rhs.linear(); + res.linear().noalias() = lhs.linear() * rhs.linear(); res.translation() = lhs.linear() * rhs.translation() + lhs.translation(); res.makeAffine(); return res;
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 682c4c7..d942ac8 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h
@@ -69,18 +69,18 @@ EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {} /** \brief Returns the x-translation by value. **/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar x() const { return m_coeffs.x(); } + EIGEN_DEVICE_FUNC constexpr Scalar x() const { return m_coeffs.x(); } /** \brief Returns the y-translation by value. **/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar y() const { return m_coeffs.y(); } + EIGEN_DEVICE_FUNC constexpr Scalar y() const { return m_coeffs.y(); } /** \brief Returns the z-translation by value. **/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar z() const { return m_coeffs.z(); } + EIGEN_DEVICE_FUNC constexpr Scalar z() const { return m_coeffs.z(); } /** \brief Returns the x-translation as a reference. **/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& x() { return m_coeffs.x(); } + EIGEN_DEVICE_FUNC constexpr Scalar& x() { return m_coeffs.x(); } /** \brief Returns the y-translation as a reference. **/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& y() { return m_coeffs.y(); } + EIGEN_DEVICE_FUNC constexpr Scalar& y() { return m_coeffs.y(); } /** \brief Returns the z-translation as a reference. **/ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar& z() { return m_coeffs.z(); } + EIGEN_DEVICE_FUNC constexpr Scalar& z() { return m_coeffs.z(); } EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; } EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 440573f..d49c961 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -183,7 +183,7 @@ * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return Side == OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } @@ -191,7 +191,7 @@ * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return rows(); } /** \brief Essential part of a Householder vector. * \param[in] k Index of Householder reflection
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 0beef60..904d853 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -51,8 +51,8 @@ compute(mat); } - EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); } - EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); } + constexpr Index rows() const noexcept { return m_invdiag.size(); } + constexpr Index cols() const noexcept { return m_invdiag.size(); } template <typename MatType> DiagonalPreconditioner& analyzePattern(const MatType&) {
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index ba379ec..dd40058 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -84,10 +84,10 @@ } /** \returns number of rows of the factored matrix */ - EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); } + constexpr Index rows() const noexcept { return m_L.rows(); } /** \returns number of columns of the factored matrix */ - EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); } + constexpr Index cols() const noexcept { return m_L.cols(); } /** \brief Reports whether previous computation was successful. *
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 930077d..11ce5e5 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -135,9 +135,9 @@ /** \brief Extraction Method for U-Factor */ const FactorType matrixU() const; - EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + constexpr Index rows() const noexcept { return m_lu.rows(); } - EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } + constexpr Index cols() const noexcept { return m_lu.cols(); } /** \brief Reports whether previous computation was successful. * @@ -446,4 +446,4 @@ } // end namespace Eigen -#endif // EIGEN_INCOMPLETE_LUT_H \ No newline at end of file +#endif // EIGEN_INCOMPLETE_LUT_H
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index cf85f2e..5caa396 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -218,10 +218,10 @@ } /** \internal */ - EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); } + constexpr Index rows() const noexcept { return matrix().rows(); } /** \internal */ - EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); } + constexpr Index cols() const noexcept { return matrix().cols(); } /** \returns the tolerance threshold used by the stopping criteria. * \sa setTolerance()
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h index 2b146b3..271679f 100644 --- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -50,8 +50,8 @@ SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess) : m_dec(dec), m_rhs(rhs), m_guess(guess) {} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h index 574021d..12cf6c2 100644 --- a/Eigen/src/KLUSupport/KLUSupport.h +++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -100,8 +100,8 @@ if (m_numeric) klu_free_numeric(&m_numeric, &m_common); } - EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); } - EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); } + constexpr Index rows() const noexcept { return mp_matrix.rows(); } + constexpr Index cols() const noexcept { return mp_matrix.cols(); } /** \brief Reports whether previous computation was successful. *
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 3e57764..a725a7b 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h
@@ -391,8 +391,8 @@ MatrixType reconstructedMatrix() const; - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } + EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lu.rows(); } + EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template <typename RhsType, typename DstType> @@ -717,7 +717,7 @@ // Step 2 m_lu.topLeftCorner(smalldim, smalldim).template triangularView<UnitLower>().solveInPlace(c.topRows(smalldim)); - if (rows > cols) c.bottomRows(rows - cols) -= m_lu.bottomRows(rows - cols) * c.topRows(cols); + if (rows > cols) c.bottomRows(rows - cols).noalias() -= m_lu.bottomRows(rows - cols) * c.topRows(cols); // Step 3 m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 1edd6b8..f09b90e 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h
@@ -210,8 +210,8 @@ MatrixType reconstructedMatrix() const; - EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } - EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } + constexpr Index rows() const noexcept { return m_lu.rows(); } + constexpr Index cols() const noexcept { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template <typename RhsType, typename DstType>
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index d1ad63d..dcb4dba 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h
@@ -379,7 +379,7 @@ Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; - dst = m_matrixV.leftCols(l_rank) * tmp; + dst.noalias() = m_matrixV.leftCols(l_rank) * tmp; } template <typename Derived>
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index d78b30b..6df6318 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -172,7 +172,7 @@ // 1 - update the k-th column of A SubColumnType v_k = A.col(k).tail(remainingRows); v_k -= V_k1 * Y.row(k).head(k).adjoint(); - if (k) v_k -= X_k1 * A.col(k).head(k); + if (k) v_k.noalias() -= X_k1 * A.col(k).head(k); // 2 - construct left Householder transform in-place v_k.makeHouseholderInPlace(tau_v, diagonal[k]); @@ -203,7 +203,7 @@ SubRowType u_k(A.row(k).tail(remainingCols)); u_k = u_k.conjugate(); { - u_k -= Y_k * A.row(k).head(k + 1).adjoint(); + u_k.noalias() -= Y_k * A.row(k).head(k + 1).adjoint(); if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint(); }
diff --git a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h index a45be20..c603a38 100644 --- a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +++ b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
@@ -308,19 +308,24 @@ this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index); } }; + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; + using head_loop = + unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>; + using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { const Index size = kernel.size(); const Index alignedStart = DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size); const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize); - unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart); + head_loop::run(kernel, 0, alignedStart); constexpr float cost = static_cast<float>(XprEvaluationCost); AssignmentFunctor functor(kernel); device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost); - unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); + tail_loop::run(kernel, alignedEnd, size); } };
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h index 5a2f38f..ff98639 100644 --- a/Eigen/src/misc/lapacke_helpers.h +++ b/Eigen/src/misc/lapacke_helpers.h
@@ -75,7 +75,7 @@ /// translates storage order of the given Eigen object to the corresponding lapack constant template <typename Derived> -EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR lapack_int lapack_storage_of(const EigenBase<Derived> &) { +EIGEN_ALWAYS_INLINE constexpr lapack_int lapack_storage_of(const EigenBase<Derived> &) { return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; }
diff --git a/Eigen/src/plugins/BlockMethods.inc b/Eigen/src/plugins/BlockMethods.inc index 46dc9dd..0782aa3 100644 --- a/Eigen/src/plugins/BlockMethods.inc +++ b/Eigen/src/plugins/BlockMethods.inc
@@ -1365,6 +1365,6 @@ * \sa subVector(Index) */ template <DirectionType Direction> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index subVectors() const { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index subVectors() const { return (Direction == Vertical) ? cols() : rows(); }
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index 2a1e7ab..d72d88a 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake
@@ -1,19 +1,26 @@ include(EigenTesting) include(CheckCXXSourceCompiles) -# configure the "site" and "buildname" + +# configure the "site" and "buildname" ei_set_sitename() # retrieve and store the build string ei_set_build_string() add_custom_target(buildtests) + +if (NOT EIGEN_CTEST_ARGS) + # By default, run tests in parallel on all available cores. + set(EIGEN_CTEST_ARGS "-j0") +endif() add_custom_target(check COMMAND "ctest" ${EIGEN_CTEST_ARGS}) + add_dependencies(check buildtests) # Convenience target for only building GPU tests. add_custom_target(buildtests_gpu) -add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" +add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" "--no-compress-output" "--build-no-clean" "-T" "test" @@ -59,7 +66,7 @@ set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}") endif() - + elseif(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS") endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 0d123a2..6178d4b 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt
@@ -115,3 +115,8 @@ WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc) add_dependencies(doc doc-eigen-prerequisites doc-unsupported-prerequisites) + +add_custom_target(install-doc ALL + COMMAND ${CMAKE_COMMAND} -E copy_directory ${Eigen_BINARY_DIR}/doc ${CMAKE_INSTALL_FULL_DOCDIR} +) +add_dependencies(install-doc doc)
diff --git a/scripts/msvc_setup.ps1 b/scripts/msvc_setup.ps1 new file mode 100644 index 0000000..e2d0642 --- /dev/null +++ b/scripts/msvc_setup.ps1
@@ -0,0 +1,21 @@ +# Powershell script to set up MSVC environment. + +param ($EIGEN_CI_MSVC_ARCH, $EIGEN_CI_MSVC_VER) + +Set-PSDebug -Trace 1 + +function Get-ScriptDirectory { Split-Path $MyInvocation.ScriptName } + +# Set defaults if not already set. +IF (!$EIGEN_CI_MSVC_ARCH) { $EIGEN_CI_MSVC_ARCH = "x64" } +IF (!$EIGEN_CI_MSVC_VER) { $EIGEN_CI_MSVC_VER = "14.29" } + +# Export variables into the global scope +$global:EIGEN_CI_MSVC_ARCH = $EIGEN_CI_MSVC_ARCH +$global:EIGEN_CI_MSVC_VER = $EIGEN_CI_MSVC_VER + +# Find Visual Studio installation directory. +$global:VS_INSTALL_DIR = &"${Env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath +# Run VCVarsAll.bat initialization script and extract environment variables. +# http://allen-mack.blogspot.com/2008/03/replace-visual-studio-command-prompt.html +cmd.exe /c "`"${VS_INSTALL_DIR}\VC\Auxiliary\Build\vcvarsall.bat`" $EIGEN_CI_MSVC_ARCH -vcvars_ver=$EIGEN_CI_MSVC_VER & set" | foreach { if ($_ -match "=") { $v = $_.split("="); set-item -force -path "ENV:\$($v[0])" -value "$($v[1])" } } \ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 813cc53..e62ec45 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt
@@ -186,6 +186,7 @@ ei_add_test(float_conversion) ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") +ei_add_test(packet_segment) ei_add_test(vectorization_logic) ei_add_test(basicstuff) ei_add_test(constexpr)
diff --git a/test/packet_ostream.h b/test/packet_ostream.h deleted file mode 100644 index 49e1bb0..0000000 --- a/test/packet_ostream.h +++ /dev/null
@@ -1,23 +0,0 @@ -#ifndef TEST_PACKET_OSTREAM -#define TEST_PACKET_OSTREAM - -#include <type_traits> -#include <ostream> - -// Include this header to be able to print Packets while debugging. - -template <typename Packet, - typename EnableIf = std::enable_if_t<Eigen::internal::unpacket_traits<Packet>::vectorizable> > -std::ostream& operator<<(std::ostream& os, const Packet& packet) { - using Scalar = typename Eigen::internal::unpacket_traits<Packet>::type; - Scalar v[Eigen::internal::unpacket_traits<Packet>::size]; - Eigen::internal::pstoreu(v, packet); - os << "{" << v[0]; - for (int i = 1; i < Eigen::internal::unpacket_traits<Packet>::size; ++i) { - os << "," << v[i]; - } - os << "}"; - return os; -} - -#endif // TEST_PACKET_OSTREAM \ No newline at end of file
diff --git a/test/packet_segment.cpp b/test/packet_segment.cpp new file mode 100644 index 0000000..6fa6a29 --- /dev/null +++ b/test/packet_segment.cpp
@@ -0,0 +1,168 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 The Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template <typename Scalar, typename Packet> +void verify_data(const Scalar* data_in, const Scalar* data_out, const Packet& a, Index begin, Index count) { + constexpr int PacketSize = internal::unpacket_traits<Packet>::size; + bool ok = true; + for (Index i = begin; i < begin + count; i++) { + ok = ok && numext::equal_strict(data_in[i], data_out[i]); + } + if (!ok) { + std::cout << "begin: " << begin << ", count: " << count << "\n"; + std::cout << "Scalar type: " << type_name(Scalar()) << " x " << PacketSize << "\n"; + std::cout << "data in: {"; + for (Index i = 0; i < PacketSize; i++) { + if (i > 0) std::cout << ","; + if (i < begin || i >= begin + count) { + std::cout << "MASK"; + } else { + std::cout << data_in[i]; + } + } + std::cout << "}\n"; + std::cout << "data out: {"; + for (Index i = 0; i < PacketSize; i++) { + if (i > 0) std::cout << ","; + if (i < begin || i >= begin + count) { + std::cout << "MASK"; + } else { + std::cout << data_out[i]; + } + } + std::cout << "}\n"; + std::cout << "packet: "; + std::cout << internal::postream(a) << "\n"; + } + VERIFY(ok); +} + +template <typename Scalar, int PacketSize, bool Run = internal::find_packet_by_size<Scalar, PacketSize>::value> +struct packet_segment_test_impl { + using Packet = typename internal::find_packet_by_size<Scalar, PacketSize>::type; + static void test_unaligned() { + // test loading a packet segment from unaligned memory that includes unallocated memory + + // | X X X X | * * * X | X X X X | + // begin -> { X | * * * } <- begin + count + + VectorX<Scalar> data_in(PacketSize), data_out(PacketSize); + data_in.setRandom(); 
+ data_out.setRandom(); + + Scalar* unaligned_data_in = data_in.data() - 1; + Scalar* unaligned_data_out = data_out.data() - 1; + + Index begin = 1; + Index count = PacketSize - 1; + + Packet a = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count); + internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, a, begin, count); + + verify_data(unaligned_data_in, unaligned_data_out, a, begin, count); + + // test loading the entire packet + + data_in.setRandom(); + data_out.setRandom(); + + unaligned_data_in = data_in.data(); + unaligned_data_out = data_out.data(); + + begin = 0; + count = PacketSize; + + Packet b = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count); + internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, b, begin, count); + + verify_data(unaligned_data_in, unaligned_data_out, b, begin, count); + + // test loading an empty packet segment in unallocated memory + count = 0; + + for (begin = 0; begin < PacketSize; begin++) { + data_in.setRandom(); + data_out = data_in; + Packet c = internal::ploaduSegment<Packet>(data_in.data(), begin, count); + internal::pstoreuSegment<Scalar, Packet>(data_out.data(), c, begin, count); + // verify that ploaduSegment / pstoreuSegment did nothing + VERIFY_IS_CWISE_EQUAL(data_in, data_out); + } + } + static void test_aligned() { + // test loading a packet segment from aligned memory that includes unallocated memory + + // | X X X X | * * * X | X X X X | + // begin -> { * * * X } <- begin + count + + VectorX<Scalar> data_in(PacketSize - 1), data_out(PacketSize - 1); + data_in.setRandom(); + data_out.setRandom(); + + Scalar* aligned_data_in = data_in.data(); + Scalar* aligned_data_out = data_out.data(); + + Index begin = 0; + Index count = PacketSize - 1; + + Packet b = internal::ploadSegment<Packet>(aligned_data_in, begin, count); + internal::pstoreSegment<Scalar, Packet>(aligned_data_out, b, begin, count); + + verify_data(aligned_data_in, aligned_data_out, b, begin, count); + } + static 
void run() { + test_unaligned(); + test_aligned(); + } +}; + +template <typename Scalar, int PacketSize> +struct packet_segment_test_impl<Scalar, PacketSize, false> { + static void run() {} +}; + +template <typename Scalar, int PacketSize> +struct packet_segment_test_driver { + static void run() { + packet_segment_test_impl<Scalar, PacketSize>::run(); + packet_segment_test_driver<Scalar, PacketSize / 2>::run(); + } +}; + +template <typename Scalar> +struct packet_segment_test_driver<Scalar, 1> { + static void run() {} +}; + +template <typename Scalar> +void test_packet_segment() { + packet_segment_test_driver<Scalar, internal::packet_traits<Scalar>::size>::run(); +} + +EIGEN_DECLARE_TEST(packet_segment) { + for (int i = 0; i < g_repeat; i++) { + test_packet_segment<bool>(); + test_packet_segment<int8_t>(); + test_packet_segment<uint8_t>(); + test_packet_segment<int16_t>(); + test_packet_segment<uint16_t>(); + test_packet_segment<int32_t>(); + test_packet_segment<uint32_t>(); + test_packet_segment<int64_t>(); + test_packet_segment<uint64_t>(); + test_packet_segment<bfloat16>(); + test_packet_segment<half>(); + test_packet_segment<float>(); + test_packet_segment<double>(); + test_packet_segment<std::complex<float>>(); + test_packet_segment<std::complex<double>>(); + } +}
diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9c5d6cf..7647592 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp
@@ -10,7 +10,6 @@ #include "packetmath_test_shared.h" #include "random_without_cast_overflow.h" -#include "packet_ostream.h" template <typename T> inline T REF_ADD(const T& a, const T& b) { @@ -24,21 +23,55 @@ inline T REF_MUL(const T& a, const T& b) { return a * b; } + +template <typename Scalar, typename EnableIf = void> +struct madd_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) { + return a * b + c; + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) { + return a * b - c; + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) { + return c - a * b; + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) { + return Scalar(0) - (a * b + c); + } +}; + +template <typename Scalar> +struct madd_impl<Scalar, + std::enable_if_t<Eigen::internal::is_scalar<Scalar>::value && Eigen::NumTraits<Scalar>::IsSigned>> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) { + return numext::fma(a, b, c); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) { + return numext::fma(a, b, Scalar(-c)); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) { + return numext::fma(Scalar(-a), b, c); + } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) { + return -Scalar(numext::fma(a, b, c)); + } +}; + template <typename T> inline T REF_MADD(const T& a, const T& b, const T& c) { - return a * b + c; + return madd_impl<T>::madd(a, b, c); } template <typename T> inline T REF_MSUB(const T& a, const T& b, const T& c) { - return a * b - c; + return madd_impl<T>::msub(a, b, c); } template <typename T> inline T 
REF_NMADD(const T& a, const T& b, const T& c) { - return c - a * b; + return madd_impl<T>::nmadd(a, b, c); } template <typename T> inline T REF_NMSUB(const T& a, const T& b, const T& c) { - return test::negate(a * b + c); + return madd_impl<T>::nmsub(a, b, c); } template <typename T> inline T REF_DIV(const T& a, const T& b) { @@ -70,6 +103,14 @@ inline bool REF_MADD(const bool& a, const bool& b, const bool& c) { return (a && b) || c; } +template <> +inline bool REF_DIV(const bool& a, const bool& b) { + return a && b; +} +template <> +inline bool REF_RECIPROCAL(const bool& a) { + return a; +} template <typename T> inline T REF_FREXP(const T& x, T& exp) { @@ -92,6 +133,26 @@ return static_cast<T>(ldexp(x, static_cast<int>(exp))); } +// provides a convenient function to take the absolute value of each component of a complex number to prevent +// catastrophic cancellation in randomly generated complex numbers +template <typename T, bool IsComplex = NumTraits<T>::IsComplex> +struct abs_helper_impl { + static T run(T x) { return numext::abs(x); } +}; +template <typename T> +struct abs_helper_impl<T, true> { + static T run(T x) { + T res = x; + numext::real_ref(res) = numext::abs(numext::real(res)); + numext::imag_ref(res) = numext::abs(numext::imag(res)); + return res; + } +}; +template <typename T> +T abs_helper(T x) { + return abs_helper_impl<T>::run(x); +} + // Uses pcast to cast from one array to another. 
template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio> struct pcast_array; @@ -481,8 +542,8 @@ eigen_optimization_barrier_test<Scalar>::run(); for (int i = 0; i < size; ++i) { - data1[i] = internal::random<Scalar>() / RealScalar(PacketSize); - data2[i] = internal::random<Scalar>() / RealScalar(PacketSize); + data1[i] = internal::random<Scalar>(); + data2[i] = internal::random<Scalar>(); refvalue = (std::max)(refvalue, numext::abs(data1[i])); } @@ -502,8 +563,8 @@ for (int M = 0; M < PacketSize; ++M) { for (int N = 0; N <= PacketSize; ++N) { for (int j = 0; j < size; ++j) { - data1[j] = internal::random<Scalar>() / RealScalar(PacketSize); - data2[j] = internal::random<Scalar>() / RealScalar(PacketSize); + data1[j] = internal::random<Scalar>(); + data2[j] = internal::random<Scalar>(); refvalue = (std::max)(refvalue, numext::abs(data1[j])); } @@ -568,6 +629,7 @@ negate_test<Scalar, Packet>(data1, data2, ref, PacketSize); CHECK_CWISE1_IF(PacketTraits::HasReciprocal, REF_RECIPROCAL, internal::preciprocal); CHECK_CWISE1(numext::conj, internal::pconj); + CHECK_CWISE1_IF(PacketTraits::HasSign, numext::sign, internal::psign); for (int offset = 0; offset < 3; ++offset) { @@ -632,11 +694,17 @@ // Avoid overflows. 
if (NumTraits<Scalar>::IsInteger && NumTraits<Scalar>::IsSigned && Eigen::internal::unpacket_traits<Packet>::size > 1) { - Scalar limit = - static_cast<Scalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())), - 1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size))); + Scalar limit = static_cast<Scalar>( + static_cast<RealScalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())), + 1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size)))); for (int i = 0; i < PacketSize; ++i) { - data1[i] = internal::random<Scalar>(-limit, limit); + data1[i] = internal::random<Scalar>(Scalar(0) - limit, limit); + } + } else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex) { + // Prevent very small product results by adjusting range. Otherwise, + // we may end up with multiplying e.g. 32 Eigen::halfs with values < 1. + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.5), Scalar(1)) * (internal::random<bool>() ? Scalar(-1) : Scalar(1)); } } ref[0] = Scalar(1); @@ -724,11 +792,6 @@ packetmath_pcast_ops_runner<Scalar, Packet>::run(); packetmath_minus_zero_add_test<Scalar, Packet>::run(); - for (int i = 0; i < size; ++i) { - data1[i] = numext::abs(internal::random<Scalar>()); - } - CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); - CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt); CHECK_CWISE3_IF(true, REF_MADD, internal::pmadd); if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) { nmsub_test<Scalar, Packet>(data1, data2, ref, PacketSize); @@ -738,14 +801,18 @@ // which can lead to very flaky tests. Here we ensure the signs are such that // they do not cancel. 
for (int i = 0; i < PacketSize; ++i) { - data1[i] = numext::abs(internal::random<Scalar>()); - data1[i + PacketSize] = numext::abs(internal::random<Scalar>()); - data1[i + 2 * PacketSize] = Scalar(0) - numext::abs(internal::random<Scalar>()); + data1[i] = abs_helper(internal::random<Scalar>()); + data1[i + PacketSize] = abs_helper(internal::random<Scalar>()); + data1[i + 2 * PacketSize] = Scalar(0) - abs_helper(internal::random<Scalar>()); } if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) { CHECK_CWISE3_IF(true, REF_MSUB, internal::pmsub); CHECK_CWISE3_IF(true, REF_NMADD, internal::pnmadd); } + + CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); + CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt); + CHECK_CWISE1_IF(PacketTraits::HasCbrt, numext::cbrt, internal::pcbrt); } // Notice that this definition works for complex types as well. @@ -767,6 +834,7 @@ CREATE_FUNCTOR(psqrt_functor, internal::psqrt); CREATE_FUNCTOR(prsqrt_functor, internal::prsqrt); +CREATE_FUNCTOR(pcbrt_functor, internal::pcbrt); // TODO(rmlarsen): Run this test for more functions. template <bool Cond, typename Scalar, typename Packet, typename RefFunctorT, typename FunctorT> @@ -1137,6 +1205,7 @@ packetmath_test_IEEE_corner_cases<PacketTraits::HasSqrt, Scalar, Packet>(numext::sqrt<Scalar>, psqrt_functor()); packetmath_test_IEEE_corner_cases<PacketTraits::HasRsqrt, Scalar, Packet>(numext::rsqrt<Scalar>, prsqrt_functor()); + packetmath_test_IEEE_corner_cases<PacketTraits::HasCbrt, Scalar, Packet>(numext::cbrt<Scalar>, pcbrt_functor()); // TODO(rmlarsen): Re-enable for half and bfloat16. 
if (PacketTraits::HasCos && !internal::is_same<Scalar, half>::value && @@ -1665,7 +1734,7 @@ for (Index N = 0; N <= PacketSize; ++N) { for (Index i = 0; i < N; ++i) { - data1[i] = internal::random<Scalar>() / RealScalar(PacketSize); + data1[i] = internal::random<Scalar>(); } for (Index i = 0; i < N * 20; ++i) { @@ -1684,7 +1753,7 @@ } for (Index i = 0; i < N * 7; ++i) { - buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize); + buffer[i] = internal::random<Scalar>(); } packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N); internal::pstore_partial(data1, packet, N);
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h index 93d4149..7d7a0da 100644 --- a/test/packetmath_test_shared.h +++ b/test/packetmath_test_shared.h
@@ -162,7 +162,9 @@ template <typename T> inline Packet load(const T* from, unsigned long long umask) const { - return internal::ploadu<Packet>(from, umask); + using UMaskType = typename numext::get_integer_by_size<internal::plain_enum_max( + internal::unpacket_traits<Packet>::size / CHAR_BIT, 1)>::unsigned_type; + return internal::ploadu<Packet>(from, static_cast<UMaskType>(umask)); } template <typename T> @@ -172,7 +174,9 @@ template <typename T> inline void store(T* to, const Packet& x, unsigned long long umask) const { - internal::pstoreu(to, x, umask); + using UMaskType = typename numext::get_integer_by_size<internal::plain_enum_max( + internal::unpacket_traits<Packet>::size / CHAR_BIT, 1)>::unsigned_type; + internal::pstoreu(to, x, static_cast<UMaskType>(umask)); } template <typename T>
diff --git a/test/product.h b/test/product.h index f8eb5df..f37a932 100644 --- a/test/product.h +++ b/test/product.h
@@ -38,6 +38,15 @@ std::enable_if_t<RhsType::SizeAtCompileTime != Dynamic, void> check_mismatched_product(LhsType& /*unused*/, const RhsType& /*unused*/) {} +template <typename Scalar, typename V1, typename V2> +Scalar ref_dot_product(const V1& v1, const V2& v2) { + Scalar out = Scalar(0); + for (Index i = 0; i < v1.size(); ++i) { + out = Eigen::numext::fma(v1[i], v2[i], out); + } + return out; +} + template <typename MatrixType> void product(const MatrixType& m) { /* this test covers the following files: @@ -245,7 +254,10 @@ // inner product { Scalar x = square2.row(c) * square2.col(c2); - VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); + // NOTE: FMA is necessary here in the reference to ensure accuracy for + // large vector sizes and float16/bfloat16 types. + Scalar y = ref_dot_product<Scalar>(square2.row(c), square2.col(c2)); + VERIFY_IS_APPROX(x, y); } // outer product
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 902aa96..ea210a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -45,19 +45,19 @@ template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor> struct TVPanelSize { // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + static constexpr StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + static constexpr StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; // TileSizeDimNC: determines the tile size for the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + static constexpr StorageIndex TileSizeDimNC = NCWindow / NCFactor; // TileSizeDimC: determines the tile size for the contracting dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + static constexpr StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + static constexpr StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + static constexpr StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; // BC : determines if supporting bank conflict is required - static EIGEN_CONSTEXPR bool BC = false; + static constexpr bool BC = false; }; #endif @@ -81,40 +81,40 @@ template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex
REG_SIZE_N, StorageIndex TSDK> struct TTPanelSize { // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered - static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + static constexpr StorageIndex TileSizeDimK = TSDK; // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// #ifndef EIGEN_SYCL_REG_M - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; + static constexpr StorageIndex WorkLoadPerThreadM = REG_SIZE_M; #else - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; + static constexpr StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; #endif // WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro #ifndef EIGEN_SYCL_REG_N - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; + static constexpr StorageIndex WorkLoadPerThreadN = REG_SIZE_N; #else - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; + static constexpr StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; #endif // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + static constexpr StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + static constexpr StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; // TileSizeDimM: determines the tile size for the m dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + static constexpr 
StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; // TileSizeDimN: determines the tile size for the n dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + static constexpr StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize - static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + static constexpr StorageIndex LoadPerThreadLhs = ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize - static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + static constexpr StorageIndex LoadPerThreadRhs = ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); // BC : determines if supporting bank conflict is required - static EIGEN_CONSTEXPR bool BC = true; + static constexpr bool BC = true; // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) - static EIGEN_CONSTEXPR bool DoubleBuffer = + static constexpr bool DoubleBuffer = #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER false; #else @@ -220,7 +220,7 @@ template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<dt != data_source::global_mem, void> write( PacketType &packet_data, DataScalar ptr) { - EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size; + constexpr int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size; EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; i++) { *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data); @@ -320,14 +320,14 @@ */ template <bool is_transposed, 
bool is_rhs_, bool packet_load_, typename PacketType> struct BlockProperties { - static EIGEN_CONSTEXPR bool packet_load = packet_load_; + static constexpr bool packet_load = packet_load_; typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar; - static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + static constexpr bool is_rhs = is_rhs_; typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType; - static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size; - static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); - static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); - static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); + static constexpr int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size; + static constexpr bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static constexpr int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static constexpr int c_stride = (is_coalesced_layout ? 1 : elements_per_access); }; /*! 
@@ -458,11 +458,11 @@ public: typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType; - static EIGEN_CONSTEXPR int PacketSize = + static constexpr int PacketSize = Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize; - static EIGEN_CONSTEXPR bool is_lhs_transposed = + static constexpr bool is_lhs_transposed = !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous; - static EIGEN_CONSTEXPR bool is_rhs_transposed = + static constexpr bool is_rhs_transposed = !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous; typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable, @@ -473,20 +473,20 @@ PacketReturnType> RHSBlockProperties; - static EIGEN_CONSTEXPR StorageIndex NStride = + static constexpr StorageIndex NStride = contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch; typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr; typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr; typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr> tile_ptr; - static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local - ? Properties::TileSizeDimM + Properties::BC - : Properties::WorkLoadPerThreadM; - static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local - ? 
Properties::TileSizeDimN + Properties::BC - : Properties::WorkLoadPerThreadN; - static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + static constexpr StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static constexpr StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static constexpr StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; /** * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not @@ -638,7 +638,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, PacketReturnType *privateRes) const { StorageIndex idx = 0; - EIGEN_CONSTEXPR StorageIndex lhs_stride = + constexpr StorageIndex lhs_stride = contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; EIGEN_UNROLL_LOOP for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { @@ -668,8 +668,7 @@ // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* // WorkLoadPerThreadN slice of N - EIGEN_CONSTEXPR StorageIndex GlobalNStride = - contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + constexpr StorageIndex GlobalNStride = contraction_tp == contraction_type::local ? 
1 : Properties::LocalThreadSizeN; EIGEN_UNROLL_LOOP for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { // output leading dimension @@ -713,9 +712,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::no_local> extract_block( const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &, const StorageIndex &ncOffset, const StorageIndex cOffset) const { - EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + constexpr StorageIndex LocalThreadSizeNC = InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; - EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + constexpr StorageIndex WorkLoadPerThreadNC = InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; @@ -891,11 +890,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::local> extract_block( const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex> &local_index, const StorageIndex &ncOffset, const StorageIndex cOffset) const { - EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + constexpr StorageIndex TileSizeDimNC = InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; - EIGEN_CONSTEXPR StorageIndex LoadPerThread = + constexpr StorageIndex LoadPerThread = InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; - EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + constexpr StorageIndex LSD = InputBlockProperties::is_rhs ? 
LSDR : LSDL; static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), " LocalOffset must be divisible by stride"); @@ -995,11 +994,11 @@ struct GeneralVectorTensor { typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType; - static EIGEN_CONSTEXPR int PacketSize = + static constexpr int PacketSize = Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize; typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch; - static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + static constexpr StorageIndex OutScratchOffset = KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make @@ -1328,8 +1327,8 @@ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct input_mapper_propertis { - static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; - static EIGEN_CONSTEXPR bool is_rhs_matrix = + static constexpr bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static constexpr bool is_rhs_matrix = (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); }; @@ -1537,9 +1536,9 @@ void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, StorageIndex NC, StorageIndex C) const { const StorageIndex nonContractDim = NC; - EIGEN_CONSTEXPR StorageIndex NCFactor = 1; - EIGEN_CONSTEXPR StorageIndex CFactor = 1; - EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + constexpr StorageIndex NCFactor = 1; + constexpr StorageIndex CFactor = 1; + constexpr 
StorageIndex NCWindow = 16; typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor> Properties; const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); @@ -1601,7 +1600,7 @@ (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), "The Local thread size must be a power of 2 for the reduction " "operation"); - EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + constexpr StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread // reduces at least 512 elementss individually, we get better performance.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 6a7571c..394c150 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -42,10 +42,10 @@ typedef Index Real; enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 }; - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } + EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real lowest() { return n; } }; namespace internal { @@ -569,47 +569,47 @@ namespace Eigen { namespace internal { template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { +static EIGEN_DEVICE_FUNC constexpr bool index_known_statically(Index i) { return index_known_statically_impl<T>::run(i); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { +static EIGEN_DEVICE_FUNC constexpr bool all_indices_known_statically() { return all_indices_known_statically_impl<T>::run(); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { +static EIGEN_DEVICE_FUNC constexpr bool indices_statically_known_to_increase() { return indices_statically_known_to_increase_impl<T>::run(); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { +static EIGEN_DEVICE_FUNC constexpr bool index_statically_eq(Index i, Index value) { return index_statically_eq_impl<T>::run(i, value); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool 
index_statically_ne(Index i, Index value) { +static EIGEN_DEVICE_FUNC constexpr bool index_statically_ne(Index i, Index value) { return index_statically_ne_impl<T>::run(i, value); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { +static EIGEN_DEVICE_FUNC constexpr bool index_statically_gt(Index i, Index value) { return index_statically_gt_impl<T>::run(i, value); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { +static EIGEN_DEVICE_FUNC constexpr bool index_statically_lt(Index i, Index value) { return index_statically_lt_impl<T>::run(i, value); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { +static EIGEN_DEVICE_FUNC constexpr bool index_pair_first_statically_eq(Index i, Index value) { return index_pair_first_statically_eq_impl<T>::run(i, value); } template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { +static EIGEN_DEVICE_FUNC constexpr bool index_pair_second_statically_eq(Index i, Index value) { return index_pair_second_statically_eq_impl<T>::run(i, value); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 8c2bb2e..8454070 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -212,9 +212,9 @@ typedef U first_type; typedef V second_type; - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {} + constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {} - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {} + constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) { using numext::swap; @@ -224,20 +224,20 @@ }; template <typename U, typename V> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) { return (x.first == y.first && x.second == y.second); } template <typename U, typename V> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) { return !(x == y); } // Can't use std::pairs on cuda devices template <typename Idx> struct IndexPair { - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) { first = val.first; @@ -251,19 +251,18 @@ namespace internal { template <typename IndexType, typename Index, Index First, Index... 
Is> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array( +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array( IndexType& idx, numeric_list<Index, First, Is...>) { return {static_cast<Index>(idx[First]), static_cast<Index>(idx[Is])...}; } template <typename IndexType, typename Index> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&, - numeric_list<Index>) { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) { return array<Index, 0>(); } /** Make an array (for index/dimensions) out of a custom index */ template <typename Index, std::size_t NumIndices, typename IndexType> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) { +constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) { return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{}); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 6944c03..b4749b4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -402,9 +402,9 @@ template <typename Index, Index LTP, Index LTR, bool BC_> struct ReductionPannel { - static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; - static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; - static EIGEN_CONSTEXPR bool BC = BC_; + static constexpr Index LocalThreadSizeP = LTP; + static constexpr Index LocalThreadSizeR = LTR; + static constexpr bool BC = BC_; }; template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt> @@ -430,7 +430,7 @@ "The Local thread size must be a power of 2 for the reduction " "operation"); - EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; + constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; // In this step, we force the code not to be more than 2-step reduction: // Our empirical research shows that if each thread reduces at least 64 // elements individually, we get better performance. However, this can change @@ -445,7 +445,7 @@ const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; const Index globalRange = pNumGroups * rNumGroups * localRange; - EIGEN_CONSTEXPR Index scratchSize = + constexpr Index scratchSize = PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); if (rNumGroups > 1) { @@ -482,15 +482,15 @@ struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> { typedef typename Self::CoeffReturnType CoeffReturnType; typedef typename Self::EvaluatorPointerType EvaluatorPointerType; - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; - static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1; + static constexpr bool HasOptimizedImplementation = true; + static constexpr int PacketSize = Self::PacketAccess ? 
Self::PacketSize : 1; static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType; static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), "The Local thread size must be a power of 2 for the reduction " "operation"); - EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; typename Self::Index inputSize = self.impl().dimensions().TotalSize(); // In this step we force the code not to be more than 2-step reduction: @@ -535,7 +535,7 @@ // col reduction template <typename Self, typename Op> struct OuterReducer<Self, Op, Eigen::SyclDevice> { - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static constexpr bool HasOptimizedImplementation = true; static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, @@ -549,7 +549,7 @@ // row reduction template <typename Self, typename Op> struct InnerReducer<Self, Op, Eigen::SyclDevice> { - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static constexpr bool HasOptimizedImplementation = true; static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, @@ -566,7 +566,7 @@ // generic partial reduction template <typename Self, typename Op> struct GenericReducer<Self, Op, Eigen::SyclDevice> { - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; + static constexpr bool HasOptimizedImplementation = false; static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename 
Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index f0a390f..6de0867 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -197,7 +197,7 @@ // threads. Currently set to twice the cache line size on Intel and ARM // processors. EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) { - EIGEN_CONSTEXPR Index kBlockAlignment = 128; + constexpr Index kBlockAlignment = 128; const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size); return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h index 30fde91..3636788 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -51,7 +51,7 @@ template <typename index_t> struct ScanParameters { // must be power of 2 - static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + static constexpr index_t ScanPerThread = 8; const index_t total_size; const index_t non_scan_size; const index_t scan_size; @@ -86,7 +86,7 @@ struct ScanKernelFunctor { typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LocalAccessor; - static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2; + static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2; LocalAccessor scratch; Evaluator dev_eval; @@ -288,7 +288,7 @@ struct ScanAdjustmentKernelFunctor { typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LocalAccessor; - static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2; + static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2; InAccessor in_ptr; OutAccessor out_ptr; const ScanParameters<Index> scanParameters;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index c149985..5357a48 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -27,6 +27,10 @@ typedef std::remove_reference_t<Nested> Nested_; static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; static constexpr int Layout = XprTraits::Layout; + enum { + // Trace is read-only. + Flags = traits<XprType>::Flags & ~LvalueBit + }; }; template <typename Dims, typename XprType> @@ -203,6 +207,8 @@ return true; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return nullptr; } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index 51c0ad6..ae5c4f4 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -106,7 +106,7 @@ int one; int two; int flags; - constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + constexpr Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} }; std::size_t m_numIndices;
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 3f9bb51..66a982b 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -25,7 +25,7 @@ constexpr static std::size_t N = sizeof...(nn); template <typename T> - constexpr static inline std::array<T, N> run(const std::array<T, N>& indices) { + constexpr static std::array<T, N> run(const std::array<T, N>& indices) { return {{indices[nn]...}}; } }; @@ -51,7 +51,7 @@ template <typename iib> struct tensor_static_symgroup_multiply_helper { template <int... iia> - constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) { + constexpr static numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) { return numeric_list<int, get<iia, iib>::value...>(); } }; @@ -107,9 +107,9 @@ }; template <typename Index, std::size_t N, int... ii, int... jj> -constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, - internal::numeric_list<int, ii...>, - internal::numeric_list<int, jj...>) { +constexpr static std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, + internal::numeric_list<int, ii...>, + internal::numeric_list<int, jj...>) { return {{idx[ii]..., idx[jj]...}}; } @@ -179,9 +179,9 @@ typedef typename group_elements::type ge; public: - constexpr inline StaticSGroup() {} - constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {} - constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {} + constexpr StaticSGroup() {} + constexpr StaticSGroup(const StaticSGroup<Gen...>&) {} + constexpr StaticSGroup(StaticSGroup<Gen...>&&) {} template <typename Op, typename RV, typename Index, std::size_t N, typename... Args> static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... 
args) { @@ -196,8 +196,8 @@ constexpr static std::size_t static_size = ge::count; - constexpr static inline std::size_t size() { return ge::count; } - constexpr static inline int globalFlags() { return group_elements::global_flags; } + constexpr static std::size_t size() { return ge::count; } + constexpr static int globalFlags() { return group_elements::global_flags; } template <typename Tensor_, typename... IndexTypes> inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h index e3be69d..632f437 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -36,15 +36,15 @@ */ template <std::size_t I_, class T> -constexpr inline T& array_get(std::vector<T>& a) { +constexpr T& array_get(std::vector<T>& a) { return a[I_]; } template <std::size_t I_, class T> -constexpr inline T&& array_get(std::vector<T>&& a) { +constexpr T&& array_get(std::vector<T>&& a) { return a[I_]; } template <std::size_t I_, class T> -constexpr inline T const& array_get(std::vector<T> const& a) { +constexpr T const& array_get(std::vector<T> const& a) { return a[I_]; }
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h index 808fd7d..19ec8ea 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -268,7 +268,7 @@ /* compute the scaled predicted reduction and */ /* the scaled directional derivative. */ - wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1); + wa3.noalias() = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1); temp1 = numext::abs2(wa3.stableNorm() / fnorm); temp2 = numext::abs2(sqrt(par) * pnorm / fnorm); prered = temp1 + temp2 / Scalar(.5);
diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h index 4bf7ee5..201fba3 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h +++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -75,7 +75,7 @@ /* givens rotation. */ w_givens[j] = givens; } else - v_givens[j] = IdentityRotation; + w_givens[j] = IdentityRotation; /* test for zero diagonal elements in the output s. */ if (s(j, j) == 0.) {
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index cf09749..8d6821a 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -108,6 +108,17 @@ VERIFY_IS_EQUAL(slice.data(), tensor.data()); } +static void test_ref_of_trace() { + Tensor<int, 2> input(6, 6); + input.setRandom(); + int trace = 0; + for (int i = 0; i < 6; ++i) { + trace += input(i, i); + } + TensorRef<const Tensor<int, 0>> ref(input.trace()); + VERIFY_IS_EQUAL(ref.coeff(0), trace); +} + static void test_ref_of_ref() { Tensor<float, 3> input(3, 5, 7); input.setRandom(); @@ -224,6 +235,7 @@ CALL_SUBTEST(test_simple_rvalue_ref()); CALL_SUBTEST(test_multiple_dims()); CALL_SUBTEST(test_slice()); + CALL_SUBTEST(test_ref_of_trace()); CALL_SUBTEST(test_ref_of_ref()); CALL_SUBTEST(test_ref_in_expr()); CALL_SUBTEST(test_coeff_ref());