Update Eigen to commit:3460f3558e7b469efb8a225894e21929c8c77629
CHANGELOG
=========
3460f3558 - Use VERIFY_IS_EQUAL to compare to zeros.
13a1f25da - Revert StlIterators edit from "Fix undefined behavior..."
fd2fd4870 - Update file ForwardDeclarations.h
37b2e9717 - Tweak special case handling in atan2.
a1cdcdb03 - Fix undefined behavior in Block access
4a58f30aa - Fix pre-POWER8_VECTOR bugs in pcmp_lt and pnegate and reactivate psqrt.
12ad99ce6 - Remove unused variables from GenericPacketMathFunctions.h
6987a200b - Fix stupid sparse bugs with outerSize == 0
0471e61b4 - Optimize various mathematical packet ops
1aa6dc200 - Fix sparse warnings
17ae83a96 - Fix bugs exposed by enabling GPU asserts.
ab8725d94 - Turn off vectorize version of rsqrt - doesn't match generic version
6d9f662a7 - Tweak atan2
6fc9de7d9 - Fix slowdown in bfloat16 MMA when rows is not a multiple of 8 or columns is not a multiple of 4.
6d4221af7 - Revert qr tests
7f58bc98b - Refactor sparse
576448572 - More fixes for __GNUC_PATCHLEVEL__.
164ddf75a - Use __GNUC_PATCHLEVEL__ rather than __GNUC_PATCH__, according to the documentation https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html
5a7ca681d - Fix sparse insert
08c961e83 - Add custom ODR-safe assert.
3fe8c5110 - Replace the Deprecated `$<CONFIGURATION>` with `$<CONFIG>`
d70b4864d - issue #2581: review and cleanup of compiler version checks
b52312068 - [SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen
bae119bb7 - Support per-thread is_malloc_allowed() state
fa0bd2c34 - improve sparse permutations
2e61c0c6b - Add missing EIGEN_DEVICE_FUNC in a few places when called by asserts.
4aca06f63 - avoid move assignment in ColPivHouseholderQR
68082b822 - Fix QR, again
4d0576534 - Altivec fixes for Darwin: do not use unsupported VSX insns
PiperOrigin-RevId: 506525228
Change-Id: I7b499f4573134b7fbdc73a4feb15a3ea032b599d
diff --git a/Eigen/Core b/Eigen/Core
index 623d735..af71891 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -34,16 +34,16 @@
#include <new>
#endif
-// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
+// Disable the ipa-cp-clone optimization flag with MinGW 6.x or older (enabled by default with -O3)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_MOST(5,5)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_STRICT_LESS_THAN(6,0,0)
#pragma GCC optimize ("-fno-ipa-cp-clone")
#endif
// Prevent ICC from specializing std::complex operators that silently fail
// on device. This allows us to use our own device-compatible specializations
// instead.
-#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \
+#if EIGEN_COMP_ICC && defined(EIGEN_GPU_COMPILE_PHASE) \
&& !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
#endif
@@ -82,7 +82,6 @@
#include <cstddef>
#include <cstdlib>
#include <cmath>
-#include <cassert>
#include <functional>
#ifndef EIGEN_NO_IO
#include <sstream>
@@ -158,6 +157,7 @@
#include "src/Core/util/Constants.h"
#include "src/Core/util/Meta.h"
+#include "src/Core/util/Assert.h"
#include "src/Core/util/ForwardDeclarations.h"
#include "src/Core/util/StaticAssert.h"
#include "src/Core/util/XprHelper.h"
diff --git a/Eigen/src/Cholesky/LLT_LAPACKE.h b/Eigen/src/Cholesky/LLT_LAPACKE.h
index 62bc679..a32ff23 100644
--- a/Eigen/src/Cholesky/LLT_LAPACKE.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@@ -70,6 +70,7 @@
template<typename Scalar, UpLoType Mode>
struct lapacke_llt {
+ EIGEN_STATIC_ASSERT(((Mode == Lower) || (Mode == Upper)),MODE_MUST_BE_UPPER_OR_LOWER)
template<typename MatrixType>
static Index blocked(MatrixType& m)
{
@@ -80,10 +81,11 @@
/* Set up parameters for ?potrf */
lapack_int size = to_lapack(m.rows());
lapack_int matrix_order = lapack_storage_of(m);
+ constexpr char uplo = Mode == Upper ? 'U' : 'L';
Scalar* a = &(m.coeffRef(0,0));
lapack_int lda = to_lapack(m.outerStride());
- lapack_int info = potrf(matrix_order, translate_mode<Mode>, size, to_lapack(a), lda );
+ lapack_int info = potrf(matrix_order, uplo, size, to_lapack(a), lda );
info = (info==0) ? -1 : info>0 ? info-1 : size;
return info;
}
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 8fb1f81..2b3a1c7 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -937,7 +937,7 @@
}
// forward declaration
-template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
+template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void check_for_aliasing(const Dst &dst, const Src &src);
// Generic Dense to Dense assignment
// Note that the last template argument "Weak" is needed to make it possible to perform
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 19c4b68..5805763 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -336,6 +336,17 @@
enum {
XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
};
+
+ /** \internal Returns base+offset (unless base is null, in which case returns null).
+ * Adding an offset to nullptr is undefined behavior, so we must avoid it.
+ */
+ template <typename Scalar>
+ EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE
+ static Scalar* add_to_nullable_pointer(Scalar* base, Index offset)
+ {
+ return base != nullptr ? base+offset : nullptr;
+ }
+
public:
typedef MapBase<BlockType> Base;
@@ -346,8 +357,9 @@
*/
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
BlockImpl_dense(XprType& xpr, Index i)
- : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
- || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
+ : Base(add_to_nullable_pointer(xpr.data(),
+ i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
+ || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())),
BlockRows==1 ? 1 : xpr.rows(),
BlockCols==1 ? 1 : xpr.cols()),
m_xpr(xpr),
@@ -361,7 +373,8 @@
*/
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
- : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+ : Base(add_to_nullable_pointer(xpr.data(),
+ xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))),
m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
{
init();
@@ -373,7 +386,9 @@
BlockImpl_dense(XprType& xpr,
Index startRow, Index startCol,
Index blockRows, Index blockCols)
- : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+ : Base(add_to_nullable_pointer(xpr.data(),
+ xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+ blockRows, blockCols),
m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
{
init();
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index cf588bd..b409a39 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -54,18 +54,6 @@
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
-#elif EIGEN_COMP_GNUC
- // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
- // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
- // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
- template<typename PtrType>
- EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
- #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
- eigen_assert((internal::is_constant_evaluated() \
- || (internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0) \
- && "this assertion is explained here: " \
- "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
- " **** READ THIS WEB PAGE !!! ****");
#else
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
eigen_assert((internal::is_constant_evaluated() || (internal::UIntPtr(array) & (sizemask)) == 0) \
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index af773dd..d65c63a 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -1219,6 +1219,43 @@
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr Packet
psignbit(const Packet& a) { return psignbit_impl<Packet>::run(a); }
+/** \internal \returns the 2-argument arc tangent of \a y and \a x (coeff-wise) */
+template <typename Packet, std::enable_if_t<is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet patan2(const Packet& y, const Packet& x) {
+ return numext::atan2(y, x);
+}
+
+/** \internal \returns the 2-argument arc tangent of \a y and \a x (coeff-wise) */
+template <typename Packet, std::enable_if_t<!is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet patan2(const Packet& y, const Packet& x) {
+ typedef typename internal::unpacket_traits<Packet>::type Scalar;
+
+ // See https://en.cppreference.com/w/cpp/numeric/math/atan2
+ // for how corner cases are supposed to be handled according to the
+ // IEEE floating-point standard (IEC 60559).
+ const Packet kSignMask = pset1<Packet>(-Scalar(0));
+ const Packet kZero = pzero(x);
+ const Packet kOne = pset1<Packet>(Scalar(1));
+ const Packet kPi = pset1<Packet>(Scalar(EIGEN_PI));
+
+ const Packet x_has_signbit = psignbit(x);
+ const Packet y_signmask = pand(y, kSignMask);
+ const Packet x_signmask = pand(x, kSignMask);
+ const Packet result_signmask = pxor(y_signmask, x_signmask);
+ const Packet shift = por(pand(x_has_signbit, kPi), y_signmask);
+
+ const Packet x_and_y_are_same = pcmp_eq(pabs(x), pabs(y));
+ const Packet x_and_y_are_zero = pcmp_eq(por(x, y), kZero);
+
+ Packet arg = pdiv(y, x);
+ arg = pselect(x_and_y_are_same, por(kOne, result_signmask), arg);
+ arg = pselect(x_and_y_are_zero, result_signmask, arg);
+
+ Packet result = patan(arg);
+ result = padd(result, shift);
+ return result;
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index b194353..def5428 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1734,6 +1734,12 @@
return static_cast<T>(atan(x));
}
+template <typename T, std::enable_if_t<!NumTraits<T>::IsComplex, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan2(const T& y, const T& x) {
+ EIGEN_USING_STD(atan2);
+ return static_cast<T>(atan2(y, x));
+}
+
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T atanh(const T &x) {
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index ea2178f..8767e1f 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -362,9 +362,9 @@
/////////// QR module ///////////
inline const HouseholderQR<PlainObject> householderQr() const;
- inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
- inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
- inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;
+ template<typename PermutationIndex = DefaultPermutationIndex> inline const ColPivHouseholderQR<PlainObject, PermutationIndex> colPivHouseholderQr() const;
+ template<typename PermutationIndex = DefaultPermutationIndex> inline const FullPivHouseholderQR<PlainObject, PermutationIndex> fullPivHouseholderQr() const;
+ template<typename PermutationIndex = DefaultPermutationIndex> inline const CompleteOrthogonalDecomposition<PlainObject, PermutationIndex> completeOrthogonalDecomposition() const;
/////////// Eigenvalues module ///////////
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 74650ef..c56318c 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -402,7 +402,7 @@
template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
struct check_transpose_aliasing_run_time_selector
{
- static bool run(const Scalar* dest, const OtherDerived& src)
+ EIGEN_DEVICE_FUNC static bool run(const Scalar* dest, const OtherDerived& src)
{
return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
}
@@ -411,7 +411,7 @@
template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
{
- static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
+ EIGEN_DEVICE_FUNC static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
{
return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
|| ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
@@ -431,7 +431,7 @@
>
struct checkTransposeAliasing_impl
{
- static void run(const Derived& dst, const OtherDerived& other)
+ EIGEN_DEVICE_FUNC static void run(const Derived& dst, const OtherDerived& other)
{
eigen_assert((!check_transpose_aliasing_run_time_selector
<typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
@@ -445,13 +445,13 @@
template<typename Derived, typename OtherDerived>
struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
{
- static void run(const Derived&, const OtherDerived&)
+ EIGEN_DEVICE_FUNC static void run(const Derived&, const OtherDerived&)
{
}
};
template<typename Dst, typename Src>
-void check_for_aliasing(const Dst &dst, const Src &src)
+EIGEN_DEVICE_FUNC inline void check_for_aliasing(const Dst &dst, const Src &src)
{
if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index b004f76..762cbfc 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -745,7 +745,7 @@
# endif
protected:
- Index redux_length() const
+ EIGEN_DEVICE_FUNC Index redux_length() const
{
return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
}
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 0fe830a..60e1ff4 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -670,7 +670,7 @@
}
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may flip
// the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -684,7 +684,7 @@
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// See pmin above
Packet4d res;
asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
@@ -705,7 +705,7 @@
}
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// See pmin above
Packet8f res;
asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
@@ -716,7 +716,7 @@
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// See pmin above
Packet4d res;
asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 159ae3e..916babf 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -31,7 +31,7 @@
#endif
// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
+#if EIGEN_GNUC_STRICT_AT_LEAST(5,3,0) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
#define EIGEN_HAS_AVX512_MATH 1
#else
#define EIGEN_HAS_AVX512_MATH 0
@@ -2384,7 +2384,7 @@
EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
Packet16bf r;
-#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10,1,0)
// Since GCC 10.1 supports avx512bf16 and C style explicit cast
// (C++ static_cast is not supported yet), do conversion via intrinsic
// and register path for performance.
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index f2e3e84..bbc016a 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -17,7 +17,7 @@
namespace internal {
// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
+#if EIGEN_GNUC_STRICT_AT_LEAST(5,3,0) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
#define EIGEN_HAS_AVX512_MATH 1
#else
#define EIGEN_HAS_AVX512_MATH 0
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 46812f9..60f28ff 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -18,7 +18,7 @@
namespace internal {
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
#if defined(_BIG_ENDIAN)
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -103,7 +103,7 @@
HasMin = 0,
HasMax = 0,
HasSqrt = 1,
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
HasBlend = 1,
#endif
HasSetLinear = 0
@@ -115,7 +115,7 @@
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
// Load a single std::complex<float> from memory and duplicate
//
// Using pload would read past the end of the reference in this case
@@ -151,7 +151,7 @@
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
{
Packet4f res0, res1;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
// Load two std::complex<float> from memory and combine
__asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
__asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
@@ -269,7 +269,7 @@
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
Packet4f tmp = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
kernel.packet[1].v = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
#else
@@ -284,7 +284,7 @@
return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
}
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
Packet2cf result;
result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
@@ -298,7 +298,7 @@
}
//---------- double ----------
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
struct Packet1cd
{
EIGEN_STRONG_INLINE Packet1cd() {}
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 6f48d98..45761e2 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -60,27 +60,7 @@
return patan_float(_x);
}
-#ifdef __VSX__
-#ifndef EIGEN_COMP_CLANG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f prsqrt<Packet4f>(const Packet4f& x)
-{
- return vec_rsqrt(x);
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d prsqrt<Packet2d>(const Packet2d& x)
-{
- return vec_rsqrt(x);
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d patan<Packet2d>(const Packet2d& _x)
-{
- return patan_double(_x);
-}
-#endif
-
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet4f psqrt<Packet4f>(const Packet4f& x)
{
@@ -93,6 +73,30 @@
return vec_sqrt(x);
}
+#if !EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+ return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
+// vec_rsqrt returns different results from the generic version
+// return vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+ return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+// vec_rsqrt returns different results from the generic version
+// return vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d patan<Packet2d>(const Packet2d& _x)
+{
+ return patan_double(_x);
+}
+#endif
+
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet2d pexp<Packet2d>(const Packet2d& _x)
{
@@ -103,7 +107,7 @@
BF16_TO_F32_UNARY_OP_WRAPPER(psqrt<Packet4f>, a);
}
-#ifndef EIGEN_COMP_CLANG
+#if !EIGEN_COMP_CLANG
template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
}
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 61d9618..adb2eac 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -1007,6 +1007,7 @@
Packet2ul v1[8];
+ // This is transposing and interleaving data
v1[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
v1[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
v1[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
@@ -1052,19 +1053,82 @@
if(PanelMode) ri += vectorSize*(stride - offset - depth);
}
-
- if(PanelMode) ri += offset;
-
- for(; j < rows; j++)
+ if(j + 4 <= rows)
{
const DataMapper lhs2 = lhs.getSubMapper(j, 0);
- for(Index i = 0; i < depth; i++)
+ Index i = 0;
+
+ if(PanelMode) ri += 4*offset;
+
+ for(; i + 2 <= depth; i+=2)
{
- blockA[ri] = lhs2(0, i);
- ri += 1;
+ if(StorageOrder == ColMajor)
+ {
+ PacketBlock<Packet8bf,2> block;
+
+ block.packet[0] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
+ block.packet[1] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 1, 4);
+
+ block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+
+ pstore<bfloat16>(blockA + ri, block.packet[0]);
+ } else {
+ blockA[ri+0] = lhs2(0, i + 0);
+ blockA[ri+1] = lhs2(0, i + 1);
+ blockA[ri+2] = lhs2(1, i + 0);
+ blockA[ri+3] = lhs2(1, i + 1);
+ blockA[ri+4] = lhs2(2, i + 0);
+ blockA[ri+5] = lhs2(2, i + 1);
+ blockA[ri+6] = lhs2(3, i + 0);
+ blockA[ri+7] = lhs2(3, i + 1);
+ }
+
+ ri += 2*4;
+ }
+ if (depth & 1)
+ {
+ if(StorageOrder == ColMajor)
+ {
+ Packet8bf lhsV = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
+
+ pstore_partial<bfloat16>(blockA + ri, lhsV, 4);
+ } else {
+ blockA[ri+0] = lhs2(0, i);
+ blockA[ri+1] = lhs2(1, i);
+ blockA[ri+2] = lhs2(2, i);
+ blockA[ri+3] = lhs2(3, i);
+ }
+
+ ri += 4;
}
- if(PanelMode) ri += stride - depth;
+ if(PanelMode) ri += 4*(stride - offset - depth);
+ j += 4;
+ }
+
+ if (j < rows)
+ {
+ if(PanelMode) ri += offset*(rows - j);
+
+ Index i = 0;
+ for(; i + 2 <= depth; i+=2)
+ {
+ Index k = j;
+ for(; k < rows; k++)
+ {
+ blockA[ri+0] = lhs(k, i + 0);
+ blockA[ri+1] = lhs(k, i + 1);
+ ri += 2;
+ }
+ }
+ if (depth & 1)
+ {
+ for(; j < rows; j++)
+ {
+ blockA[ri] = lhs(j, i);
+ ri += 1;
+ }
+ }
}
}
};
@@ -1163,18 +1227,29 @@
if(PanelMode) ri += 4*(stride - offset - depth);
}
- if(PanelMode) ri += offset;
-
- for(; j < cols; j++)
+ if (j < cols)
{
- const DataMapper rhs2 = rhs.getSubMapper(0, j);
- for(Index i = 0; i < depth; i++)
- {
- blockB[ri] = rhs2(i, 0);
- ri += 1;
- }
+ if(PanelMode) ri += offset*(cols - j);
- if(PanelMode) ri += stride - depth;
+ Index i = 0;
+ for(; i + 2 <= depth; i+=2)
+ {
+ Index k = j;
+ for(; k < cols; k++)
+ {
+ blockB[ri+0] = rhs(i + 0, k);
+ blockB[ri+1] = rhs(i + 1, k);
+ ri += 2;
+ }
+ }
+ if (depth & 1)
+ {
+ for(; j < cols; j++)
+ {
+ blockB[ri] = rhs(i, j);
+ ri += 1;
+ }
+ }
}
}
};
@@ -2662,6 +2737,7 @@
#endif
#ifdef __MMA__
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
@@ -2689,6 +2765,7 @@
dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
+#endif
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index 01465d3..c6d4216 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -18,8 +18,8 @@
pstoreu(result, result_block);
}
-template<Index num_packets, bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
+template<bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA)
{
Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
if(zero){
@@ -31,48 +31,33 @@
}
template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index strideA, Index extra_rows)
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index extra_rows)
{
- Index row_count = 0;
if (zero) {
- EIGEN_ALIGN16 bfloat16 lhs_array[8] = { Eigen::bfloat16(0) };
- do{
- lhs_array[row_count] = *indexA;
- indexA += strideA;
- } while ((row_count += 2) < extra_rows*2);
- return pload_partial<Packet8bf>(lhs_array, extra_rows*2);
+ Packet8bf lhs1 = ploadu_partial<Packet8bf>(indexA, extra_rows);
+ Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+ return vec_mergeh(lhs1.m_val, lhs2.m_val);
} else {
- EIGEN_ALIGN16 int lhs_array[4];
- do{
- lhs_array[row_count] = *reinterpret_cast<const int *>(indexA);
- indexA += strideA;
- } while ((row_count += 1) < extra_rows);
- return reinterpret_cast<Packet8us>(pload_partial<Packet4i>(lhs_array, extra_rows));
+ return reinterpret_cast<Packet8us>(ploadu_partial<Packet4i>(reinterpret_cast<const int *>(indexA), extra_rows));
}
}
template<bool zero>
EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, Index extra_rows)
{
- return loadBfloat16Extra<zero>(indexA + row*strideA, strideA, extra_rows);
+ return loadBfloat16Extra<zero>(indexA + row*strideA, extra_rows);
}
template<bool zero>
EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, Index i, Index k)
{
- const bfloat16* indexB = baseB + strideB*4*i + (k*4);
- Packet8bf rhs1 = ploadu<Packet8bf>(indexB);
- if(zero){
- Packet8bf rhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
- return vec_mergeh(rhs1.m_val, rhs2.m_val);
- }
- return rhs1;
+ return loadBfloat16<zero>(baseB + strideB*4*i + (k*4));
}
template<bool zero>
EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, Index i, Index k, Index extra_cols)
{
- return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k+offsetB, strideB, extra_cols);
+ return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k*extra_cols+offsetB, extra_cols);
}
template<Index num_acc, Index num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
@@ -93,8 +78,8 @@
{
Packet8bf lhs;
Packet8bf rhs[num_acc];
- if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k, strideA, row, extra_rows);
- else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
+ if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k*extra_rows, strideA, row, extra_rows);
+ else lhs = loadBfloat16<zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
BFLOAT16_UNROLL
for(Index i = 0; i < num_acc; i++){
if(!rhs_extra_cols)
@@ -125,7 +110,7 @@
KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
}
if(depth&1){
- KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-offset_row, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
+ KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-(offset_row&(num_packets-1)), indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
}
BFLOAT16_UNROLL
@@ -174,8 +159,10 @@
typedef typename DataMapper::LinearMapper LinearMapper;
for(Index j = 0; j < cols; j++){
const LinearMapper res2 = res.getLinearMapper(0, j);
+ float *result2 = result + j*rows;
+ BFLOAT16_UNROLL
for(Index i = 0; i < rows; i++){
- result[j*rows + i] = res2(i);
+ result2[i] = res2(i);
}
}
@@ -185,15 +172,16 @@
if( strideA == -1 ) strideA = depth;
if( strideB == -1 ) strideB = depth;
//Packing is done in blocks.
- //There's 3 possible sizes of blocks
- //Blocks of 8 columns with 16 elements (8x16) as col major
- //Blocks of 8 columns with 8 elements (8x8) as col major. This happens when there's 16 > rows > 8
- //Blocks of 8 columns with <8 elements as row major. This happens when there's less than 8 remaining rows
+ //There's 4 possible sizes of blocks
+ //Blocks of 8 columns with 16 elements (8x16)
+ //Blocks of 8 columns with 8 elements (8x8). This happens when there's 16 > rows >= 8
+ //Blocks of 8 columns with 4 elements (8x4). This happens when there's 8 > rows >= 4
+ //Blocks of 8 columns with < 4 elements. This happens when there's less than 4 remaining rows
//Loop for LHS standard block (8x16)
const Index standard_block_size = 16;
const Index standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
- Index bigSuffix = (2*8) * (strideA-offsetA-depth);
+ Index bigSuffix = (2*8) * (strideA-offsetA);
const bfloat16* indexA = blockA;
const Index offset_factor = 2;
Index block_index;
@@ -215,11 +203,11 @@
}
}
row += 16;
- indexA += bigSuffix + 2*8*depth;
+ indexA += bigSuffix;
}
//LHS (8x8) block
- if(rows - standard_blocks_quantity*16 >= 8){
- indexA += 1*8*offsetA + 2*8*offsetA;
+ if(rows & 8){
+ indexA += 1*8*offsetA;
for(Index offset_row = 0; offset_row < 8; offset_row += 4){
col = 0;
colLoopBody<7, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
@@ -238,14 +226,33 @@
}
} //end extra cols
row += 8;
+ indexA += (bigSuffix >> 1);
+ }
+ //LHS (8x4) block
+ if(rows & 4){
+ Index offset_row = (rows & 8);
+ indexA += 1*4*offsetA;
+ col = 0;
+ colLoopBody<7, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<6, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<5, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<4, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<3, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<2, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<1, 4>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result);
+ if(cols > col){
+ Index extra_cols= cols-col;
+
+ colLoopBody<1, 4, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, indexA, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
+ }
+ row += 4;
+ indexA += (bigSuffix >> 2);
}
//extra rows
- while(row < rows){
- Index extra_rows = rows-row;
- Index extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
+ if(row < rows){
+ Index extra_rows_or_four = rows-row;
- //This index is the beginning of remaining block.
- //This last block for LHS is organized as RowMajor
+ //This index is the beginning of remaining block.
col = 0;
colLoopBody<7, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
colLoopBody<6, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
@@ -269,8 +276,8 @@
//get and save block
PacketBlock<Packet8bf,4> block;
for(Index j = 0; j < 4; j++){
- Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row)));
- Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row + 4)));
+ Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(ploadu<Packet4f>(result + (col + j)*rows + row)));
+ Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(ploadu<Packet4f>(result + (col + j)*rows + row + 4)));
block.packet[j].m_val = vec_pack(reinterpret_cast<Packet4ui>(fp16_0), reinterpret_cast<Packet4ui>(fp16_1));
}
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 9d00b93..bb84ac9 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -599,7 +599,7 @@
EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a)
{
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
#else
return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
@@ -610,7 +610,7 @@
EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src)
{
Packet4f t;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
// Load float64/two float32 (doubleword alignment)
__asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src));
#else
@@ -636,7 +636,7 @@
template<typename RhsScalar>
EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i)
{
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
__asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
__asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<double*>(src) + 1)));
#else
@@ -675,7 +675,7 @@
/** \internal load and splat a complex value into a vector - column-wise */
EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src)
{
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
Packet4f ret;
__asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
return ret;
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index b0f8529..fa12892 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -173,7 +173,7 @@
HasATan = 1,
HasLog = 1,
HasExp = 1,
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
HasSqrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
@@ -218,7 +218,7 @@
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
HasSqrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
@@ -446,7 +446,7 @@
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(from);
EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
return vec_ld(0, from);
@@ -501,7 +501,7 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
return vec_ld(0, from);
@@ -608,7 +608,7 @@
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(to);
EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
vec_xst(from, 0, to);
#else
vec_st(from, 0, to);
@@ -1002,7 +1002,7 @@
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
- return p4f_ZERO - a;
+ return vec_xor(a, p4f_MZERO);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
@@ -1054,7 +1054,7 @@
template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_msub(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmsub(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmadd(a,b,c); }
@@ -1062,7 +1062,7 @@
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
- #ifdef __VSX__
+ #ifdef EIGEN_VECTORIZE_VSX
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
@@ -1080,7 +1080,7 @@
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
- #ifdef __VSX__
+ #ifdef EIGEN_VECTORIZE_VSX
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
@@ -1096,34 +1096,37 @@
template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+// To fix bug with vec_cmplt on older versions
+#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+#endif
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
return vec_nor(c,c);
}
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
#endif
template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
#endif
template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
#endif
template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
#endif
template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
#endif
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
@@ -1164,7 +1167,7 @@
Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
Packet4f res;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
__asm__("xvrspiz %x0, %x1\n\t"
: "=&wa" (res)
: "wa" (t));
@@ -1178,7 +1181,7 @@
}
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
{
Packet4f res;
@@ -1194,7 +1197,7 @@
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
-#if defined(__VSX__) || !defined(_BIG_ENDIAN)
+#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
EIGEN_DEBUG_UNALIGNED_LOAD
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
@@ -1377,7 +1380,7 @@
template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
-#if defined(__VSX__) || !defined(_BIG_ENDIAN)
+#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
vec_xst(from, 0, to);
#else
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
@@ -1773,7 +1776,7 @@
template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
}
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
}
@@ -2657,7 +2660,7 @@
//---------- double ----------
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
@@ -2710,7 +2713,11 @@
HasLog = 0,
HasExp = 1,
HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
+#else
+ HasRsqrt = 0,
+#endif
HasRound = 1,
HasFloor = 1,
HasCeil = 1,
@@ -2816,7 +2823,7 @@
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
- return p2d_ZERO - a;
+ return vec_xor(a, p2d_MZERO);
#endif
}
@@ -2935,8 +2942,7 @@
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
template<>
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 4) || \
- (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+#if EIGEN_GNUC_STRICT_AT_LEAST(7,1,0)
return vec_cts(x, 0); // TODO: check clang version.
#else
double tmp[2];
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 3060214..38bd93d 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -728,20 +728,19 @@
const Packet cst_one = pset1<Packet>(Scalar(1));
const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
- const Packet p6 = pset1<Packet>(Scalar(2.26911413483321666717529296875e-3));
- const Packet p5 = pset1<Packet>(Scalar(-1.1063250713050365447998046875e-2));
- const Packet p4 = pset1<Packet>(Scalar(2.680264413356781005859375e-2));
- const Packet p3 = pset1<Packet>(Scalar(-4.87488098442554473876953125e-2));
- const Packet p2 = pset1<Packet>(Scalar(8.874166011810302734375e-2));
- const Packet p1 = pset1<Packet>(Scalar(-0.2145837843418121337890625));
- const Packet p0 = pset1<Packet>(Scalar(1.57079613208770751953125));
+ const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
+ const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
+ const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
+ const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
+ const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
+ const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
+ const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
// For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
// function, by a 6'th order polynomial.
// For x in [-1:0) we use that acos(-x) = pi - acos(x).
- const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
- Packet x = pabs(x_in);
- const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
+ const Packet neg_mask = psignbit(x_in);
+ const Packet abs_x = pabs(x_in);
// Evaluate the polynomial using Horner's rule:
// P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
@@ -753,19 +752,15 @@
p_even = pmadd(p_even, x2, p2);
p_odd = pmadd(p_odd, x2, p1);
p_even = pmadd(p_even, x2, p0);
- Packet p = pmadd(p_odd, x, p_even);
+ Packet p = pmadd(p_odd, abs_x, p_even);
// The polynomial approximates acos(x)/sqrt(1-x), so
// multiply by sqrt(1-x) to get acos(x).
- Packet denom = psqrt(psub(cst_one, x));
+ // Conveniently returns NaN for arguments outside [-1:1].
+ Packet denom = psqrt(psub(cst_one, abs_x));
Packet result = pmul(denom, p);
-
// Undo mapping for negative arguments.
- result = pselect(neg_mask, psub(cst_pi, result), result);
- // Return NaN for arguments outside [-1:1].
- return pselect(invalid_mask,
- pset1<Packet>(std::numeric_limits<float>::quiet_NaN()),
- result);
+ return pselect(neg_mask, psub(cst_pi, result), result);
}
// Generic implementation of asin(x).
@@ -834,6 +829,8 @@
// We evaluate even and odd terms in x^2 in parallel
// to take advantage of instruction level parallelism
// and hardware with multiple FMA units.
+
+ // note: if x == -0, this returns +0
const Packet x2 = pmul(x, x);
const Packet x4 = pmul(x2, x2);
Packet q_odd = pmadd(q14, x4, q10);
@@ -852,20 +849,25 @@
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+ constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
+
+ const Packet cst_signmask = pset1<Packet>(-0.0f);
const Packet cst_one = pset1<Packet>(1.0f);
- constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI/2);
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
// "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
// "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
// calculated using Sollya.
- const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
- const Packet large_mask = pcmp_lt(cst_one, pabs(x_in));
- const Packet large_shift = pselect(neg_mask, pset1<Packet>(-kPiOverTwo), pset1<Packet>(kPiOverTwo));
- const Packet x = pselect(large_mask, preciprocal(x_in), x_in);
+
+ const Packet abs_x = pabs(x_in);
+ const Packet x_signmask = pand(x_in, cst_signmask);
+ const Packet large_mask = pcmp_lt(cst_one, abs_x);
+ const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
const Packet p = patan_reduced_float(x);
-
// Apply transformations according to the range reduction masks.
- return pselect(large_mask, psub(large_shift, p), p);
+ Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
+ // Return correct sign
+ return pxor(result, x_signmask);
}
// Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)]
@@ -920,16 +922,17 @@
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
- const Packet cst_one = pset1<Packet>(1.0);
constexpr double kPiOverTwo = static_cast<double>(EIGEN_PI / 2);
- const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
constexpr double kPiOverFour = static_cast<double>(EIGEN_PI / 4);
- const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
- const Packet cst_large = pset1<Packet>(2.4142135623730950488016887); // tan(3*pi/8);
- const Packet cst_medium = pset1<Packet>(0.4142135623730950488016887); // tan(pi/8);
+ constexpr double kTanPiOverEight = 0.4142135623730950488016887;
+ constexpr double kTan3PiOverEight = 2.4142135623730950488016887;
- const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
- Packet x = pabs(x_in);
+ const Packet cst_signmask = pset1<Packet>(-0.0);
+ const Packet cst_one = pset1<Packet>(1.0);
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+ const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
+ const Packet cst_large = pset1<Packet>(kTan3PiOverEight);
+ const Packet cst_medium = pset1<Packet>(kTanPiOverEight);
// Use the same range reduction strategy (to [0:tan(pi/8)]) as the
// Cephes library:
@@ -938,10 +941,15 @@
// use atan(x) = pi/4 + atan((x-1)/(x+1)).
// "Small": For x < tan(pi/8), approximate atan(x) directly by a polynomial
// calculated using Sollya.
- const Packet large_mask = pcmp_lt(cst_large, x);
- x = pselect(large_mask, preciprocal(x), x);
- const Packet medium_mask = pandnot(pcmp_lt(cst_medium, x), large_mask);
- x = pselect(medium_mask, pdiv(psub(x, cst_one), padd(x, cst_one)), x);
+
+ const Packet abs_x = pabs(x_in);
+ const Packet x_signmask = pand(x_in, cst_signmask);
+ const Packet large_mask = pcmp_lt(cst_large, abs_x);
+ const Packet medium_mask = pandnot(pcmp_lt(cst_medium, abs_x), large_mask);
+
+ Packet x = abs_x;
+ x = pselect(large_mask, preciprocal(abs_x), x);
+ x = pselect(medium_mask, pdiv(psub(abs_x, cst_one), padd(abs_x, cst_one)), x);
// Compute approximation of p ~= atan(x') where x' is the argument reduced to
// [0:tan(pi/8)].
@@ -950,7 +958,8 @@
// Apply transformations according to the range reduction masks.
p = pselect(large_mask, psub(cst_pi_over_two, p), p);
p = pselect(medium_mask, padd(cst_pi_over_four, p), p);
- return pselect(neg_mask, pnegate(p), p);
+ // Return the correct sign
+ return pxor(p, x_signmask);
}
template<typename Packet>
@@ -1751,7 +1760,7 @@
const Packet abs_x = pabs(x);
// Predicates for sign and magnitude of x.
const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero);
- const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
+ const Packet x_has_signbit = psignbit(x);
const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
@@ -1790,19 +1799,21 @@
const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
+ const Packet pow_is_neg_zero = pand(pandnot(y_is_int, y_is_even),
+ por(pand(y_is_neg, pand(abs_x_is_inf, x_is_neg)), pand(y_is_pos, x_is_neg_zero)));
const Packet inf_val =
pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even),
cst_neg_inf, cst_pos_inf);
-
// General computation of pow(x,y) for positive x or negative x and integer y.
const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
const Packet pow_abs = generic_pow_impl(abs_x, y);
- return pselect(
- y_is_one, x,
- pselect(pow_is_one, cst_one,
- pselect(pow_is_nan, cst_nan,
- pselect(pow_is_inf, inf_val,
- pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
+ return pselect(y_is_one, x,
+ pselect(pow_is_one, cst_one,
+ pselect(pow_is_nan, cst_nan,
+ pselect(pow_is_inf, inf_val,
+ pselect(pow_is_neg_zero, pnegate(cst_zero),
+ pselect(pow_is_zero, cst_zero,
+ pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
}
/* polevl (modified for Eigen)
@@ -2022,7 +2033,7 @@
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
- const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
+ const Packet x_has_signbit = psignbit(x);
const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
@@ -2071,13 +2082,11 @@
const Scalar pos_zero = Scalar(0);
const Scalar pos_one = Scalar(1);
const Scalar pos_inf = NumTraits<Scalar>::infinity();
- const Scalar neg_inf = -NumTraits<Scalar>::infinity();
const Scalar nan = NumTraits<Scalar>::quiet_NaN();
const Packet cst_pos_zero = pset1<Packet>(pos_zero);
const Packet cst_pos_one = pset1<Packet>(pos_one);
const Packet cst_pos_inf = pset1<Packet>(pos_inf);
- const Packet cst_neg_inf = pset1<Packet>(neg_inf);
const Packet cst_nan = pset1<Packet>(nan);
const Packet abs_x = pabs(x);
@@ -2087,7 +2096,7 @@
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
- const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
+ const Packet x_has_signbit = psignbit(x);
const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
if (exponent_is_nan) {
diff --git a/Eigen/src/Core/arch/GPU/Complex.h b/Eigen/src/Core/arch/GPU/Complex.h
index c2b4c38..4e4cdd1 100644
--- a/Eigen/src/Core/arch/GPU/Complex.h
+++ b/Eigen/src/Core/arch/GPU/Complex.h
@@ -39,7 +39,7 @@
// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
// prior to first inclusion of <complex>. This prevents ICC from adding
// its own specializations, so our custom ones below can be used instead.
-#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_))
+#if !(EIGEN_COMP_ICC && defined(_USE_COMPLEX_SPECIALIZATION_))
// Import Eigen's internal operator specializations.
#define EIGEN_USING_STD_COMPLEX_OPERATORS \
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index c892d39..ff1646d 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -96,7 +96,7 @@
template<int LaneID>
EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
{
- #if EIGEN_COMP_GNUC_STRICT
+ #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
// 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
// vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
// 2. workaround the gcc register split problem on arm64-neon
@@ -166,7 +166,7 @@
template <int LaneID>
EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
{
- #if EIGEN_COMP_GNUC_STRICT
+ #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
// 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
// vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
// 2. workaround the gcc register split problem on arm64-neon
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 069bd4c..6d4d1c3 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3696,11 +3696,11 @@
// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
// Confirmed at least with __apple_build_version__ = 6000054.
-#ifdef __apple_build_version__
+#if EIGEN_COMP_CLANGAPPLE
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
-#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
#else
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#endif
@@ -3928,7 +3928,7 @@
// Other reduction functions:
// mul
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+#if EIGEN_COMP_CLANGAPPLE
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
#else
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index a0ff359..434d033 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -475,7 +475,7 @@
template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -494,7 +494,7 @@
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -525,7 +525,7 @@
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -544,7 +544,7 @@
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
+#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index 5bc3235..062d50e 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -21,193 +21,53 @@
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>
+
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
#ifdef SYCL_DEVICE_ONLY
-
-#define SYCL_PLOADT_RO(address_space_target) \
- template <typename packet_type, int Alignment> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \
- typename cl::sycl::multi_ptr< \
- const typename unpacket_traits<packet_type>::type, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- from) { \
- typedef typename unpacket_traits<packet_type>::type scalar; \
- typedef cl::sycl::multi_ptr< \
- scalar, cl::sycl::access::address_space::address_space_target> \
- multi_ptr; \
- auto res = packet_type( \
- static_cast<typename unpacket_traits<packet_type>::type>(0)); \
- res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
- return res; \
+#define SYCL_PLOAD(packet_type, AlignedType) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
+ pload##AlignedType<packet_type>( \
+ const typename unpacket_traits<packet_type>::type* from) { \
+ using scalar = typename unpacket_traits<packet_type>::type; \
+ typedef cl::sycl::multi_ptr< \
+ const scalar, cl::sycl::access::address_space::generic_space, \
+ cl::sycl::access::decorated::no> \
+ multi_ptr; \
+ packet_type res{}; \
+ res.load(0, multi_ptr(from)); \
+ return res; \
}
-SYCL_PLOADT_RO(global_space)
-SYCL_PLOADT_RO(local_space)
-#undef SYCL_PLOADT_RO
-#endif
-
-template <typename packet_type, int Alignment, typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
-ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
- cl::sycl::access::mode::read_write, T>& from) {
- return ploadt_ro<packet_type, Alignment>(from.get_pointer());
-}
-
-#ifdef SYCL_DEVICE_ONLY
-#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \
- template <typename packet_type> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
- typename cl::sycl::multi_ptr< \
- const typename unpacket_traits<packet_type>::type, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- from) { \
- return ploadt_ro<packet_type, Alignment>(from); \
- }
-
-// global space
-SYCL_PLOAD(global_space, Unaligned, u)
-SYCL_PLOAD(global_space, Aligned, )
-// local space
-SYCL_PLOAD(local_space, Unaligned, u)
-SYCL_PLOAD(local_space, Aligned, )
+SYCL_PLOAD(cl::sycl::cl_float4, u)
+SYCL_PLOAD(cl::sycl::cl_float4, )
+SYCL_PLOAD(cl::sycl::cl_double2, u)
+SYCL_PLOAD(cl::sycl::cl_double2, )
#undef SYCL_PLOAD
-#endif
-#define SYCL_PLOAD(Alignment, AlignedType) \
- template <typename packet_type> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
- const Eigen::TensorSycl::internal::RangeAccess< \
- cl::sycl::access::mode::read_write, \
- typename unpacket_traits<packet_type>::type> \
- from) { \
- return ploadt_ro<packet_type, Alignment>(from); \
- }
-SYCL_PLOAD(Unaligned, u)
-SYCL_PLOAD(Aligned, )
-#undef SYCL_PLOAD
-
-#ifdef SYCL_DEVICE_ONLY
-/** \internal \returns a packet version of \a *from.
- * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
-#define SYCL_PLOADT(address_space_target) \
- template <typename packet_type, int Alignment> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \
- typename cl::sycl::multi_ptr< \
- const typename unpacket_traits<packet_type>::type, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- from) { \
- if (Alignment >= unpacket_traits<packet_type>::alignment) \
- return pload<packet_type>(from); \
- else \
- return ploadu<packet_type>(from); \
+#define SYCL_PSTORE(scalar, packet_type, alignment) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
+ scalar* to, const packet_type& from) { \
+ typedef cl::sycl::multi_ptr< \
+ scalar, cl::sycl::access::address_space::generic_space, \
+ cl::sycl::access::decorated::no> \
+ multi_ptr; \
+ from.store(0, multi_ptr(to)); \
}
-// global space
-SYCL_PLOADT(global_space)
-// local space
-SYCL_PLOADT(local_space)
-#undef SYCL_PLOADT
-#endif
+SYCL_PSTORE(float, cl::sycl::cl_float4, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, u)
-template <typename packet_type, int Alignment>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
-ploadt(const Eigen::TensorSycl::internal::RangeAccess<
- cl::sycl::access::mode::read_write,
- typename unpacket_traits<packet_type>::type>& from) {
- return ploadt<packet_type, Alignment>(from.get_pointer());
-}
-#ifdef SYCL_DEVICE_ONLY
-
-// private_space
-#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
- ploadt_ro<packet_type, Alignment>( \
- const typename unpacket_traits<packet_type>::type* from) { \
- typedef typename unpacket_traits<packet_type>::type scalar; \
- auto res = packet_type(static_cast<scalar>(0)); \
- res.template load<cl::sycl::access::address_space::private_space>( \
- 0, const_cast<scalar*>(from)); \
- return res; \
- }
-
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
-
-#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
- const typename unpacket_traits<packet_type>::type* from) { \
- typedef typename unpacket_traits<packet_type>::type scalar; \
- auto res = packet_type(static_cast<scalar>(0)); \
- res.template load<cl::sycl::access::address_space::private_space>( \
- 0, const_cast<scalar*>(from)); \
- return res; \
- }
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
-
-#undef SYCL_PLOAD_SPECIAL
-
-#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
- typename cl::sycl::multi_ptr< \
- scalar, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- to, \
- const packet_type& from) { \
- typedef cl::sycl::multi_ptr< \
- scalar, cl::sycl::access::address_space::address_space_target> \
- multi_ptr; \
- from.store(0, multi_ptr(to)); \
- }
-
-// global space
-SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
-SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
-
-SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#undef SYCL_PSTORE
-#define SYCL_PSTORE_T(address_space_target) \
- template <typename scalar, typename packet_type, int Alignment> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \
- typename cl::sycl::multi_ptr< \
- scalar, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- to, \
- const packet_type& from) { \
- if (Alignment) \
- pstore(to, from); \
- else \
- pstoreu(to, from); \
- }
-
-SYCL_PSTORE_T(global_space)
-
-SYCL_PSTORE_T(local_space)
-
-#undef SYCL_PSTORE_T
-
#define SYCL_PSET1(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -291,22 +151,6 @@
}
};
-#define SYCL_PLOAD_DUP(address_space_target) \
- template <typename packet_type> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \
- typename cl::sycl::multi_ptr< \
- const typename unpacket_traits<packet_type>::type, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- from) { \
- return get_base_packet<packet_type>::get_ploaddup(from); \
- }
-
-// global space
-SYCL_PLOAD_DUP(global_space)
-// local_space
-SYCL_PLOAD_DUP(local_space)
-#undef SYCL_PLOAD_DUP
-
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
@@ -325,30 +169,11 @@
const typename unpacket_traits<packet_type>::type& a) { \
return get_base_packet<packet_type>::set_plset(a); \
}
-
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET
-#define SYCL_PGATHER(address_space_target) \
- template <typename Scalar, typename packet_type> \
- EIGEN_DEVICE_FUNC inline packet_type pgather( \
- typename cl::sycl::multi_ptr< \
- const typename unpacket_traits<packet_type>::type, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- from, \
- Index stride) { \
- return get_base_packet<packet_type>::get_pgather(from, stride); \
- }
-
-// global space
-SYCL_PGATHER(global_space)
-// local space
-SYCL_PGATHER(local_space)
-
-#undef SYCL_PGATHER
-
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@@ -362,24 +187,6 @@
#undef SYCL_PGATHER_SPECILIZE
-#define SYCL_PSCATTER(address_space_target) \
- template <typename Scalar, typename packet_type> \
- EIGEN_DEVICE_FUNC inline void pscatter( \
- typename cl::sycl::multi_ptr< \
- typename unpacket_traits<packet_type>::type, \
- cl::sycl::access::address_space::address_space_target>::pointer_t \
- to, \
- const packet_type& from, Index stride) { \
- get_base_packet<packet_type>::set_pscatter(to, from, stride); \
- }
-
-// global space
-SYCL_PSCATTER(global_space)
-// local space
-SYCL_PSCATTER(local_space)
-
-#undef SYCL_PSCATTER
-
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
@@ -563,6 +370,34 @@
}
#endif // SYCL_DEVICE_ONLY
+template <typename packet_type, int Alignment, typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
+ cl::sycl::access::mode::read_write, T>& from) {
+ return ploadt_ro<packet_type, Alignment>(from.get_pointer());
+}
+
+#define SYCL_PLOAD(Alignment, AlignedType) \
+ template <typename packet_type> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
+ const Eigen::TensorSycl::internal::RangeAccess< \
+ cl::sycl::access::mode::read_write, \
+ typename unpacket_traits<packet_type>::type> \
+ from) { \
+ return ploadt_ro<packet_type, Alignment>(from); \
+ }
+SYCL_PLOAD(Unaligned, u)
+SYCL_PLOAD(Aligned, )
+#undef SYCL_PLOAD
+
+template <typename packet_type, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt(const Eigen::TensorSycl::internal::RangeAccess<
+ cl::sycl::access::mode::read_write,
+ typename unpacket_traits<packet_type>::type>& from) {
+ return ploadt<packet_type, Alignment>(from.get_pointer());
+}
+
#define SYCL_PSTORE(alignment) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
diff --git a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
index 54eedfa..c532c18 100644
--- a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
+++ b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
@@ -223,7 +223,6 @@
m_pointerMap.clear();
EIGEN_THROW_X(
std::out_of_range("The pointer is not registered in the map\n"));
-
}
--node;
}
@@ -550,7 +549,7 @@
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef T scalar_t;
typedef scalar_t &ref_t;
- typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
+ typedef scalar_t *ptr_t;
// the accessor type does not necessarily the same as T
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
@@ -570,7 +569,12 @@
RangeAccess(std::nullptr_t) : RangeAccess() {}
// This template parameter must be removed and scalar_t should be replaced
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
- return (access_.get_pointer().get() + offset_);
+ typedef cl::sycl::multi_ptr<scalar_t,
+ cl::sycl::access::address_space::generic_space,
+ cl::sycl::access::decorated::no>
+ multi_ptr;
+ multi_ptr p(access_);
+ return (p + offset_).get_raw();
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h
index 613e823..f6f057b 100644
--- a/Eigen/src/Core/arch/SYCL/TypeCasting.h
+++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -64,7 +64,7 @@
cl::sycl::rounding_mode::automatic>();
auto b1 = b.template convert<cl::sycl::cl_float,
cl::sycl::rounding_mode::automatic>();
- return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
+ return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
}
template <>
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index c9d80e6..ca08991 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -154,7 +154,7 @@
enum {
Cost = 3 * NumTraits<Scalar>::ReadCost,
PacketAccess =
- #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
+ #if defined(EIGEN_VECTORIZE_AVX) && (EIGEN_CLANG_STRICT_LESS_THAN(8,0,0) || EIGEN_COMP_CLANGAPPLE)
// This is a partial workaround for a bug in clang generating bad code
// when mixing 256/512 bits loads and 128 bits moves.
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index c8bb4e7..3c76019 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -516,68 +516,25 @@
template <typename LhsScalar, typename RhsScalar>
struct scalar_atan2_op {
using Scalar = LhsScalar;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_same<LhsScalar,RhsScalar>::value, Scalar>
- operator()(const Scalar& y, const Scalar& x) const {
- EIGEN_USING_STD(atan2);
- return static_cast<Scalar>(atan2(y, x));
+
+ static constexpr bool Enable = is_same<LhsScalar, RhsScalar>::value && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex;
+ EIGEN_STATIC_ASSERT(Enable, "LhsScalar and RhsScalar must be the same non-integer, non-complex type")
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& y, const Scalar& x) const {
+ return numext::atan2(y, x);
}
template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- std::enable_if_t<is_same<LhsScalar, RhsScalar>::value, Packet>
- packetOp(const Packet& y, const Packet& x) const {
- // See https://en.cppreference.com/w/cpp/numeric/math/atan2
- // for how corner cases are supposed to be handled according to the
- // IEEE floating-point standard (IEC 60559).
- const Packet kSignMask = pset1<Packet>(-Scalar(0));
- const Packet kPi = pset1<Packet>(Scalar(EIGEN_PI));
- const Packet kPiO2 = pset1<Packet>(Scalar(EIGEN_PI / 2));
- const Packet kPiO4 = pset1<Packet>(Scalar(EIGEN_PI / 4));
- const Packet k3PiO4 = pset1<Packet>(Scalar(3.0 * (EIGEN_PI / 4)));
-
- // Various predicates about the inputs.
- Packet x_signbit = pand(x, kSignMask);
- Packet x_has_signbit = pcmp_lt(por(x_signbit, kPi), pzero(x));
- Packet x_is_zero = pcmp_eq(x, pzero(x));
- Packet x_neg = pandnot(x_has_signbit, x_is_zero);
-
- Packet y_signbit = pand(y, kSignMask);
- Packet y_is_zero = pcmp_eq(y, pzero(y));
- Packet x_is_not_nan = pcmp_eq(x, x);
- Packet y_is_not_nan = pcmp_eq(y, y);
-
- // Compute the normal case. Notice that we expect that
- // finite/infinite = +/-0 here.
- Packet result = patan(pdiv(y, x));
-
- // Compute shift for when x != 0 and y != 0.
- Packet shift = pselect(x_neg, por(kPi, y_signbit), pzero(x));
-
- // Special cases:
- // Handle x = +/-inf && y = +/-inf.
- Packet is_not_nan = pcmp_eq(result, result);
- result =
- pselect(is_not_nan, padd(shift, result),
- pselect(x_neg, por(k3PiO4, y_signbit), por(kPiO4, y_signbit)));
- // Handle x == +/-0.
- result = pselect(
- x_is_zero, pselect(y_is_zero, pzero(y), por(y_signbit, kPiO2)), result);
- // Handle y == +/-0.
- result = pselect(
- y_is_zero,
- pselect(x_has_signbit, por(y_signbit, kPi), por(y_signbit, pzero(y))),
- result);
- // Handle NaN inputs.
- Packet kQNaN = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
- return pselect(pand(x_is_not_nan, y_is_not_nan), result, kQNaN);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& y, const Packet& x) const {
+ return internal::patan2(y, x);
}
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_atan2_op<LhsScalar, RhsScalar>> {
+ using Scalar = LhsScalar;
enum {
- PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasATan && packet_traits<LhsScalar>::HasDiv && !NumTraits<LhsScalar>::IsInteger && !NumTraits<LhsScalar>::IsComplex,
- Cost =
- scalar_div_cost<LhsScalar, PacketAccess>::value + 5 * NumTraits<LhsScalar>::MulCost + 5 * NumTraits<LhsScalar>::AddCost
+ PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<Scalar>::HasATan && packet_traits<Scalar>::HasDiv && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex,
+ Cost = scalar_div_cost<Scalar, PacketAccess>::value + functor_traits<scalar_atan_op<Scalar>>::Cost
};
};
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 4a6cef5..862efc6 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1212,7 +1212,7 @@
traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
- #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
__asm__ ("" : "+x,m" (*A0));
#endif
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
@@ -1628,7 +1628,7 @@
RhsPanel27 rhs_panel;
RhsPacket T0;
LhsPacket A2;
- #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+ #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
// without this workaround A0, A1, and A2 are loaded in the same register,
// which is not good for pipelining
@@ -1834,7 +1834,7 @@
RhsPanel15 rhs_panel;
RhsPacket T0;
LhsPacket A2;
- #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+ #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
// without this workaround A0, A1, and A2 are loaded in the same register,
// which is not good for pipelining
@@ -2081,7 +2081,7 @@
RhsPacket T0;
// NOTE: the begin/end asm comments below work around bug 935!
// but they are not enough for gcc>=6 without FMA (bug 1637)
- #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
@@ -2245,7 +2245,7 @@
// NOTE: the begin/end asm comments below work around bug 935!
// but they are not enough for gcc>=6 without FMA (bug 1637)
- #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
diff --git a/Eigen/src/Core/util/Assert.h b/Eigen/src/Core/util/Assert.h
new file mode 100644
index 0000000..f8ba632
--- /dev/null
+++ b/Eigen/src/Core/util/Assert.h
@@ -0,0 +1,166 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022, The Eigen authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_UTIL_ASSERT_H
+#define EIGEN_CORE_UTIL_ASSERT_H
+
+// Eigen custom assert function.
+//
+// The combination of Eigen's relative includes and cassert's `assert` function
+// (or any usage of the __FILE__ macro) can lead to ODR issues:
+// a header included using different relative paths in two different TUs will
+// have two different token-for-token definitions, since __FILE__ is expanded
+// as an in-line string with different values. Normally this would be
+// harmless - the linker would just choose one definition. However, it breaks
+// with C++20 modules when functions in different modules have different
+// definitions.
+//
+// To get around this, we need to use __builtin_FILE() when available, which is
+// considered a single token, and thus satisfies the ODR.
+
+// Only define eigen_plain_assert if we are debugging, and either
+// - we are not compiling for GPU, or
+// - gpu debugging is enabled.
+#if !defined(EIGEN_NO_DEBUG) && (!defined(EIGEN_GPU_COMPILE_PHASE) || !defined(EIGEN_NO_DEBUG_GPU))
+
+#include <cassert>
+
+#ifndef EIGEN_USE_CUSTOM_PLAIN_ASSERT
+// Disable new custom asserts by default for now.
+#define EIGEN_USE_CUSTOM_PLAIN_ASSERT 0
+#endif
+
+#if EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+#ifndef EIGEN_HAS_BUILTIN_FILE
+// Clang can check if __builtin_FILE() is supported.
+// GCC > 5, MSVC 2019 14.26 (1926) all have __builtin_FILE().
+//
+// For NVCC, it's more complicated. Through trial-and-error:
+// - nvcc+gcc supports __builtin_FILE() on host, and on device after CUDA 11.
+// - nvcc+msvc supports __builtin_FILE() only after CUDA 11.
+#if (EIGEN_HAS_BUILTIN(__builtin_FILE) && (EIGEN_COMP_CLANG || !defined(EIGEN_CUDA_ARCH))) || \
+ (EIGEN_GNUC_STRICT_AT_LEAST(5, 0, 0) && (EIGEN_COMP_NVCC >= 110000 || !defined(EIGEN_CUDA_ARCH))) || \
+ (EIGEN_COMP_MSVC >= 1926 && (!EIGEN_COMP_NVCC || EIGEN_COMP_NVCC >= 110000))
+#define EIGEN_HAS_BUILTIN_FILE 1
+#else
+#define EIGEN_HAS_BUILTIN_FILE 0
+#endif
+#endif // EIGEN_HAS_BUILTIN_FILE
+
+#if EIGEN_HAS_BUILTIN_FILE
+# define EIGEN_BUILTIN_FILE __builtin_FILE()
+# define EIGEN_BUILTIN_LINE __builtin_LINE()
+#else
+// Default (potentially unsafe) values.
+# define EIGEN_BUILTIN_FILE __FILE__
+# define EIGEN_BUILTIN_LINE __LINE__
+#endif
+
+// Use __PRETTY_FUNCTION__ when available, since it is more descriptive, as
+// __builtin_FUNCTION() only returns the undecorated function name.
+// This should still be okay ODR-wise since it is a compiler-specific fixed
+// value. Mixing compilers will likely lead to ODR violations anyways.
+#if EIGEN_COMP_MSVC
+# define EIGEN_BUILTIN_FUNCTION __FUNCSIG__
+#elif EIGEN_COMP_GNUC
+# define EIGEN_BUILTIN_FUNCTION __PRETTY_FUNCTION__
+#else
+# define EIGEN_BUILTIN_FUNCTION __func__
+#endif
+
+namespace Eigen {
+namespace internal {
+
+// Generic default assert handler.
+template<typename EnableIf = void, typename... EmptyArgs>
+struct assert_handler_impl {
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
+ static inline void run(const char* expression, const char* file, unsigned line, const char* function) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+ // GPU device code doesn't allow stderr or abort, so use printf and raise an
+ // illegal instruction exception to trigger a kernel failure.
+#ifndef EIGEN_NO_IO
+ printf("Assertion failed at %s:%u in %s: %s\n",
+ file == nullptr ? "<file>" : file,
+ line,
+ function == nullptr ? "<function>" : function,
+ expression);
+#endif
+ __trap();
+
+#else // EIGEN_GPU_COMPILE_PHASE
+
+ // Print to stderr and abort, as specified in <cassert>.
+#ifndef EIGEN_NO_IO
+ fprintf(stderr, "Assertion failed at %s:%u in %s: %s\n",
+ file == nullptr ? "<file>" : file,
+ line,
+ function == nullptr ? "<function>" : function,
+ expression);
+#endif
+ std::abort();
+
+#endif // EIGEN_GPU_COMPILE_PHASE
+ }
+};
+
+// Use POSIX __assert_fail handler when available.
+//
+// This allows us to integrate with systems that have custom handlers.
+//
+// NOTE: this handler is not always available on all POSIX systems (otherwise
+// we could simply test for __unix__ or similar). The handler function name
+// seems to depend on the specific toolchain implementation, and differs between
+// compilers, platforms, OSes, etc. Hence, we detect support via SFINAE.
+template<typename... EmptyArgs>
+struct assert_handler_impl<
+ void_t<decltype(__assert_fail(
+ (const char*)nullptr, // expression
+ (const char*)nullptr, // file
+ 0, // line
+ (const char*)nullptr, // function
+ std::declval<EmptyArgs>()... // Empty substitution required for SFINAE.
+ ))>, EmptyArgs... > {
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
+ static inline void run(const char* expression, const char* file, unsigned line, const char* function) {
+ // GCC requires this call to be dependent on the template parameters.
+ __assert_fail(expression, file, line, function, std::declval<EmptyArgs>()...);
+ }
+};
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
+inline void __assert_handler(const char* expression, const char* file, unsigned line, const char* function) {
+ assert_handler_impl<>::run(expression, file, line, function);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#define eigen_plain_assert(expression) \
+ (EIGEN_PREDICT_FALSE(!(expression)) ? \
+ Eigen::internal::__assert_handler(#expression, \
+ EIGEN_BUILTIN_FILE, \
+ EIGEN_BUILTIN_LINE, \
+ EIGEN_BUILTIN_FUNCTION) : (void)0)
+
+#else // EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+// Use regular assert.
+#define eigen_plain_assert(condition) assert(condition)
+
+#endif // EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+#else // EIGEN_NO_DEBUG
+
+#define eigen_plain_assert(condition) ((void)0)
+
+#endif // EIGEN_NO_DEBUG
+
+#endif // EIGEN_CORE_UTIL_ASSERT_H
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a2486af..979d974 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -478,8 +478,8 @@
ExtractType,
typename ExtractType_::PlainObject
> DirectLinearAccessType;
- static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
- static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+ EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+ EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
};
// pop conjugate
@@ -495,8 +495,8 @@
IsComplex = NumTraits<Scalar>::IsComplex,
NeedToConjugate = Base::NeedToConjugate ? 0 : IsComplex
};
- static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
- static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
+ EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+ EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
};
// pop scalar multiple
@@ -510,8 +510,8 @@
typedef blas_traits<NestedXpr> Base;
typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
typedef typename Base::ExtractType ExtractType;
- static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
- static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
+ EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+ EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
{ return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
};
template<typename Scalar, typename NestedXpr, typename Plain>
@@ -524,8 +524,8 @@
typedef blas_traits<NestedXpr> Base;
typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
typedef typename Base::ExtractType ExtractType;
- static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
- static inline Scalar extractScalarFactor(const XprType& x)
+ EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+ EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x)
{ return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
};
template<typename Scalar, typename Plain1, typename Plain2>
@@ -545,8 +545,8 @@
typedef blas_traits<NestedXpr> Base;
typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
typedef typename Base::ExtractType ExtractType;
- static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
- static inline Scalar extractScalarFactor(const XprType& x)
+ EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+ EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x)
{ return - Base::extractScalarFactor(x.nestedExpression()); }
};
@@ -567,8 +567,8 @@
enum {
IsTransposed = Base::IsTransposed ? 0 : 1
};
- static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
- static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
+ EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
+ EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
};
template<typename T>
@@ -586,7 +586,7 @@
template<typename T>
struct extract_data_selector<T,false> {
- static typename T::Scalar* run(const T&) { return 0; }
+ EIGEN_DEVICE_FUNC static typename T::Scalar* run(const T&) { return 0; }
};
template<typename T>
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 7c1a08b..643d6400 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -186,14 +186,14 @@
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
#endif
#else
- #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_COMP_GNUC )
- #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+ #if defined(__SSE2__)
+ #define EIGEN_SSE2_ON_NON_MSVC
#endif
#endif
#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
- #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+ #if defined (EIGEN_SSE2_ON_NON_MSVC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
// Defines symbols for compile-time detection of which instructions are
// used.
@@ -285,7 +285,7 @@
#endif
// Disable AVX support on broken xcode versions
- #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
+ #if ( EIGEN_COMP_CLANGAPPLE == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
// A nasty bug in the clang compiler shipped with xcode in a common compilation situation
// when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
#ifdef EIGEN_VECTORIZE_AVX
@@ -352,10 +352,10 @@
#endif
} // end extern "C"
- #elif defined __VSX__
+ #elif defined(__VSX__) && !defined(__APPLE__)
#define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_VSX
+ #define EIGEN_VECTORIZE_VSX 1
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
@@ -427,7 +427,7 @@
#include <arm_fp16.h>
#endif
-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
+#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3,8,0))
// We can use the optimized fp16 to float and float to fp16 conversion routines
#define EIGEN_HAS_FP16_C
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 8f87c4a..facb074 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -192,6 +192,8 @@
template<typename Scalar> struct scalar_acos_op;
template<typename Scalar> struct scalar_asin_op;
template<typename Scalar> struct scalar_tan_op;
+template<typename Scalar> struct scalar_atan_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar> struct scalar_atan2_op;
template<typename Scalar> struct scalar_inverse_op;
template<typename Scalar> struct scalar_square_op;
template<typename Scalar> struct scalar_cube_op;
@@ -252,15 +254,22 @@
template<typename MatrixType,int RowFactor,int ColFactor> class Replicate;
template<typename MatrixType, int Direction = BothDirections> class Reverse;
+#if defined(EIGEN_USE_LAPACKE) && defined(lapack_int)
+// Lapacke interface requires StorageIndex to be lapack_int
+typedef lapack_int DefaultPermutationIndex;
+#else
+typedef int DefaultPermutationIndex;
+#endif
+
template<typename MatrixType> class FullPivLU;
template<typename MatrixType> class PartialPivLU;
namespace internal {
template<typename MatrixType> struct inverse_impl;
}
template<typename MatrixType> class HouseholderQR;
-template<typename MatrixType> class ColPivHouseholderQR;
-template<typename MatrixType> class FullPivHouseholderQR;
-template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType, typename PermutationIndex = DefaultPermutationIndex> class ColPivHouseholderQR;
+template<typename MatrixType, typename PermutationIndex = DefaultPermutationIndex> class FullPivHouseholderQR;
+template<typename MatrixType, typename PermutationIndex = DefaultPermutationIndex> class CompleteOrthogonalDecomposition;
template<typename MatrixType> class SVDBase;
template<typename MatrixType, int Options = 0> class JacobiSVD;
template<typename MatrixType, int Options = 0> class BDCSVD;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 4cd81a0..dd94f88 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -63,20 +63,27 @@
// Compiler identification, EIGEN_COMP_*
//------------------------------------------------------------------------------------------
-/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
+/// \internal EIGEN_COMP_GNUC set to version (e.g., 951 for GCC 9.5.1) for all compilers compatible with GCC
#ifdef __GNUC__
- #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
+ #define EIGEN_COMP_GNUC (__GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__)
#else
#define EIGEN_COMP_GNUC 0
#endif
-/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
+/// \internal EIGEN_COMP_CLANG set to version (e.g., 372 for clang 3.7.2) if the compiler is clang
#if defined(__clang__)
- #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
+ #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__*10+__clang_patchlevel__)
#else
#define EIGEN_COMP_CLANG 0
#endif
+/// \internal EIGEN_COMP_CLANGAPPLE set to the version number (e.g. 9000000 for AppleClang 9.0) if the compiler is AppleClang
+#if defined(__clang__) && defined(__apple_build_version__)
+ #define EIGEN_COMP_CLANGAPPLE __apple_build_version__
+#else
+ #define EIGEN_COMP_CLANGAPPLE 0
+#endif
+
/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML
#if defined(__castxml__)
#define EIGEN_COMP_CASTXML 1
@@ -248,25 +255,48 @@
#endif
-/// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
+/// \internal EIGEN_COMP_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_CLANGICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CLANGFCC || EIGEN_COMP_CPE || EIGEN_COMP_CLANGCPE || EIGEN_COMP_LCC)
#define EIGEN_COMP_GNUC_STRICT 1
#else
#define EIGEN_COMP_GNUC_STRICT 0
#endif
-
-#if EIGEN_COMP_GNUC
- #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
- #define EIGEN_GNUC_AT_MOST(x,y) ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
- #define EIGEN_GNUC_AT(x,y) ( __GNUC__==x && __GNUC_MINOR__==y )
+// GCC, and compilers that pretend to be it, have different version schemes, so this only makes sense to use with the real GCC.
+#if EIGEN_COMP_GNUC_STRICT
+ #define EIGEN_GNUC_STRICT_AT_LEAST(x,y,z) ((__GNUC__ > x) || \
+ (__GNUC__ == x && __GNUC_MINOR__ > y) || \
+ (__GNUC__ == x && __GNUC_MINOR__ == y && __GNUC_PATCHLEVEL__ >= z))
+ #define EIGEN_GNUC_STRICT_LESS_THAN(x,y,z) ((__GNUC__ < x) || \
+ (__GNUC__ == x && __GNUC_MINOR__ < y) || \
+ (__GNUC__ == x && __GNUC_MINOR__ == y && __GNUC_PATCHLEVEL__ < z))
#else
- #define EIGEN_GNUC_AT_LEAST(x,y) 0
- #define EIGEN_GNUC_AT_MOST(x,y) 0
- #define EIGEN_GNUC_AT(x,y) 0
+ #define EIGEN_GNUC_STRICT_AT_LEAST(x,y,z) 0
+ #define EIGEN_GNUC_STRICT_LESS_THAN(x,y,z) 0
#endif
+
+/// \internal EIGEN_COMP_CLANG_STRICT set to 1 if the compiler is really Clang and not a compatible compiler (e.g., AppleClang, etc.)
+#if EIGEN_COMP_CLANG && !(EIGEN_COMP_CLANGAPPLE || EIGEN_COMP_CLANGICC || EIGEN_COMP_CLANGFCC || EIGEN_COMP_CLANGCPE)
+ #define EIGEN_COMP_CLANG_STRICT 1
+#else
+ #define EIGEN_COMP_CLANG_STRICT 0
+#endif
+
+// Clang, and compilers forked from it, have different version schemes, so this only makes sense to use with the real Clang.
+#if EIGEN_COMP_CLANG_STRICT
+ #define EIGEN_CLANG_STRICT_AT_LEAST(x,y,z) ((__clang_major__ > x) || \
+ (__clang_major__ == x && __clang_minor__ > y) || \
+ (__clang_major__ == x && __clang_minor__ == y && __clang_patchlevel__ >= z))
+ #define EIGEN_CLANG_STRICT_LESS_THAN(x,y,z) ((__clang_major__ < x) || \
+ (__clang_major__ == x && __clang_minor__ < y) || \
+ (__clang_major__ == x && __clang_minor__ == y && __clang_patchlevel__ < z))
+#else
+ #define EIGEN_CLANG_STRICT_AT_LEAST(x,y,z) 0
+ #define EIGEN_CLANG_STRICT_LESS_THAN(x,y,z) 0
+#endif
+
//------------------------------------------------------------------------------------------
// Architecture identification, EIGEN_ARCH_*
//------------------------------------------------------------------------------------------
@@ -375,7 +405,7 @@
#endif
/// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
-#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__)
#define EIGEN_ARCH_PPC 1
#else
#define EIGEN_ARCH_PPC 0
@@ -671,10 +701,13 @@
// but in practice we should not rely on them but rather on the availability of
// individual features as defined later.
// This is why there is no EIGEN_HAS_CXX17.
-#if EIGEN_MAX_CPP_VER<14 || EIGEN_COMP_CXXVER<14 || (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1900) || \
- (EIGEN_COMP_ICC && EIGEN_COMP_ICC < 1500) || (EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 80000) || \
- (EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))) || \
- (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<51)
+#if EIGEN_MAX_CPP_VER < 14 || EIGEN_COMP_CXXVER < 14 || \
+ (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1900) || \
+ (EIGEN_COMP_ICC && EIGEN_COMP_ICC < 1500) || \
+ (EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 80000) || \
+ (EIGEN_COMP_CLANG_STRICT && EIGEN_COMP_CLANG < 390) || \
+ (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE < 9000000) || \
+ (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 510)
#error This compiler appears to be too old to be supported by Eigen
#endif
@@ -721,11 +754,11 @@
// for over-aligned data, but not in a manner that is compatible with Eigen.
// See https://gitlab.com/libeigen/eigen/-/issues/2575
#ifndef EIGEN_HAS_CXX17_OVERALIGN
-#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \
- (EIGEN_COMP_MSVC >= 1912) \
- || (EIGEN_GNUC_AT_LEAST(7,0)) \
- || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \
- || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
+#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \
+ (EIGEN_COMP_MSVC >= 1912) \
+ || (EIGEN_GNUC_STRICT_AT_LEAST(7,0,0)) \
+ || (EIGEN_CLANG_STRICT_AT_LEAST(5,0,0)) \
+ || (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE >= 10000000) \
) && !EIGEN_COMP_ICC
#define EIGEN_HAS_CXX17_OVERALIGN 1
#else
@@ -819,8 +852,8 @@
// GPU stuff
-// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
-#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC)
+// Disable some features when compiling with GPU compilers (SYCL/HIPCC)
+#if defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIP_DEVICE_COMPILE)
// Do not try asserts on device code
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
@@ -829,7 +862,10 @@
#ifdef EIGEN_INTERNAL_DEBUGGING
#undef EIGEN_INTERNAL_DEBUGGING
#endif
+#endif
+// No exceptions on device.
+#if defined(SYCL_DEVICE_ONLY) || defined(EIGEN_GPU_COMPILE_PHASE)
#ifdef EIGEN_EXCEPTIONS
#undef EIGEN_EXCEPTIONS
#endif
@@ -861,17 +897,6 @@
# endif
#endif
-// eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
-#ifdef EIGEN_NO_DEBUG
- #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device
- #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x)
- #else
- #define eigen_plain_assert(x)
- #endif
-#else
- #define eigen_plain_assert(x) assert(x)
-#endif
-
// eigen_assert can be overridden
#ifndef eigen_assert
#define eigen_assert(x) eigen_plain_assert(x)
@@ -880,10 +905,10 @@
#ifdef EIGEN_INTERNAL_DEBUGGING
#define eigen_internal_assert(x) eigen_assert(x)
#else
-#define eigen_internal_assert(x)
+#define eigen_internal_assert(x) ((void)0)
#endif
-#ifdef EIGEN_NO_DEBUG
+#if defined(EIGEN_NO_DEBUG) || (defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_NO_DEBUG_GPU))
#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
#else
#define EIGEN_ONLY_USED_FOR_DEBUG(x)
@@ -980,11 +1005,16 @@
// directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
// you will need to apply to the underlying POD type.
#if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
- // This seems to be broken on clang. Packet4f is loaded into a single
- // register rather than a vector, zeroing out some entries. Integer
+ // This seems to be broken on clang. Packet4f is loaded into a single
+ // register rather than a vector, zeroing out some entries. Integer
// types also generate a compile error.
- // General, Altivec, VSX.
- #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X));
+ #if EIGEN_OS_MAC
+ // General, Altivec for Apple (VSX were added in ISA v2.06):
+ #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v" (X));
+ #else
+ // General, Altivec, VSX otherwise:
+ #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X));
+ #endif
#elif EIGEN_ARCH_ARM_OR_ARM64
#ifdef __ARM_FP
// General, VFP or NEON.
@@ -1237,10 +1267,10 @@
namespace Eigen {
namespace internal {
-inline bool all(){ return true; }
+EIGEN_DEVICE_FUNC inline bool all(){ return true; }
template<typename T, typename ...Ts>
-bool all(T t, Ts ... ts){ return t && all(ts...); }
+EIGEN_DEVICE_FUNC bool all(T t, Ts ... ts){ return t && all(ts...); }
}
}
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index e4a8793..a39c718 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -59,6 +59,25 @@
#endif
+#ifndef EIGEN_MALLOC_CHECK_THREAD_LOCAL
+
+// Check whether we can use the thread_local keyword to allow or disallow
+// allocating memory with per-thread granularity, by means of the
+// set_is_malloc_allowed() function.
+#ifndef EIGEN_AVOID_THREAD_LOCAL
+
+#if ((EIGEN_COMP_GNUC) || __has_feature(cxx_thread_local) || EIGEN_COMP_MSVC >= 1900) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL thread_local
+#else
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL
+#endif
+
+#else // EIGEN_AVOID_THREAD_LOCAL
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL
+#endif // EIGEN_AVOID_THREAD_LOCAL
+
+#endif
+
#include "../InternalHeaderCheck.h"
namespace Eigen {
@@ -156,7 +175,7 @@
#elif defined EIGEN_RUNTIME_NO_MALLOC
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
{
- static bool value = true;
+ EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
if (update == 1)
value = new_value;
return value;
@@ -938,7 +957,7 @@
~aligned_allocator() {}
- #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_STRICT_AT_LEAST(7,0,0)
// In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
// eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 6c6fb71..1a5d515 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -153,6 +153,21 @@
template< class T >
struct is_void : is_same<void, std::remove_const_t<T>> {};
+/** \internal
+ * Implementation of std::void_t for SFINAE.
+ *
+ * Pre C++17:
+ * Custom implementation.
+ *
+ * Post C++17: Uses std::void_t
+ */
+#if EIGEN_COMP_CXXVER >= 17
+using std::void_t;
+#else
+template<typename...>
+using void_t = void;
+#endif
+
template<> struct is_arithmetic<signed long long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
using std::is_integral;
@@ -232,7 +247,7 @@
*
* For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
*/
-#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_AT_MOST(9,4)
+#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_STRICT_LESS_THAN(10,0,0)
template <typename T>
EIGEN_CONSTEXPR auto index_list_size(const T& x) {
using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index c906997..6855893 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -16,12 +16,12 @@
namespace Eigen {
namespace internal {
-template<typename MatrixType_> struct traits<ColPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> struct traits<ColPivHouseholderQR<MatrixType_, PermutationIndex_> >
: traits<MatrixType_>
{
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
- typedef int StorageIndex;
+ typedef PermutationIndex_ PermutationIndex;
enum { Flags = 0 };
};
@@ -50,31 +50,41 @@
*
* \sa MatrixBase::colPivHouseholderQr()
*/
-template<typename MatrixType_> class ColPivHouseholderQR
- : public SolverBase<ColPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> class ColPivHouseholderQR
+ : public SolverBase<ColPivHouseholderQR<MatrixType_, PermutationIndex_> >
{
public:
typedef MatrixType_ MatrixType;
typedef SolverBase<ColPivHouseholderQR> Base;
friend class SolverBase<ColPivHouseholderQR>;
-
+ typedef PermutationIndex_ PermutationIndex;
EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
+
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
- typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
- typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
+ typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
+ typedef typename internal::plain_row_type<MatrixType, PermutationIndex>::type IntRowVectorType;
typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
typedef HouseholderSequence<MatrixType,internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>> HouseholderSequenceType;
typedef typename MatrixType::PlainObject PlainObject;
- private:
-
- typedef typename PermutationType::StorageIndex PermIndexType;
+private:
+ void init(Index rows, Index cols) {
+ Index diag = numext::mini(rows, cols);
+ m_hCoeffs.resize(diag);
+ m_colsPermutation.resize(cols);
+ m_colsTranspositions.resize(cols);
+ m_temp.resize(cols);
+ m_colNormsUpdated.resize(cols);
+ m_colNormsDirect.resize(cols);
+ m_isInitialized = false;
+ m_usePrescribedThreshold = false;
+ }
public:
@@ -101,16 +111,7 @@
* according to the specified problem \a size.
* \sa ColPivHouseholderQR()
*/
- ColPivHouseholderQR(Index rows, Index cols)
- : m_qr(rows, cols),
- m_hCoeffs((std::min)(rows,cols)),
- m_colsPermutation(PermIndexType(cols)),
- m_colsTranspositions(cols),
- m_temp(cols),
- m_colNormsUpdated(cols),
- m_colNormsDirect(cols),
- m_isInitialized(false),
- m_usePrescribedThreshold(false) {}
+ ColPivHouseholderQR(Index rows, Index cols) : m_qr(rows, cols) { init(rows, cols); }
/** \brief Constructs a QR factorization from a given matrix
*
@@ -124,18 +125,9 @@
*
* \sa compute()
*/
- template<typename InputType>
- explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix)
- : m_qr(matrix.rows(), matrix.cols()),
- m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
- m_colsPermutation(PermIndexType(matrix.cols())),
- m_colsTranspositions(matrix.cols()),
- m_temp(matrix.cols()),
- m_colNormsUpdated(matrix.cols()),
- m_colNormsDirect(matrix.cols()),
- m_isInitialized(false),
- m_usePrescribedThreshold(false)
- {
+ template <typename InputType>
+ explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix) : m_qr(matrix.rows(), matrix.cols()) {
+ init(matrix.rows(), matrix.cols());
compute(matrix.derived());
}
@@ -145,18 +137,9 @@
*
* \sa ColPivHouseholderQR(const EigenBase&)
*/
- template<typename InputType>
- explicit ColPivHouseholderQR(EigenBase<InputType>& matrix)
- : m_qr(matrix.derived()),
- m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
- m_colsPermutation(PermIndexType(matrix.cols())),
- m_colsTranspositions(matrix.cols()),
- m_temp(matrix.cols()),
- m_colNormsUpdated(matrix.cols()),
- m_colNormsDirect(matrix.cols()),
- m_isInitialized(false),
- m_usePrescribedThreshold(false)
- {
+ template <typename InputType>
+ explicit ColPivHouseholderQR(EigenBase<InputType>& matrix) : m_qr(matrix.derived()) {
+ init(matrix.rows(), matrix.cols());
computeInPlace();
}
@@ -441,7 +424,7 @@
protected:
- friend class CompleteOrthogonalDecomposition<MatrixType>;
+ friend class CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>;
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
@@ -460,8 +443,8 @@
Index m_det_p;
};
-template<typename MatrixType>
-typename MatrixType::Scalar ColPivHouseholderQR<MatrixType>::determinant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar ColPivHouseholderQR<MatrixType, PermutationIndex>::determinant() const
{
eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -470,8 +453,8 @@
return m_qr.diagonal().prod() * detQ * Scalar(m_det_p);
}
-template<typename MatrixType>
-typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::absDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType, PermutationIndex>::absDeterminant() const
{
using std::abs;
eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
@@ -479,8 +462,8 @@
return abs(m_qr.diagonal().prod());
}
-template<typename MatrixType>
-typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType, PermutationIndex>::logAbsDeterminant() const
{
eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -493,20 +476,20 @@
*
* \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
*/
-template<typename MatrixType>
+template<typename MatrixType, typename PermutationIndex>
template<typename InputType>
-ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+ColPivHouseholderQR<MatrixType, PermutationIndex>& ColPivHouseholderQR<MatrixType, PermutationIndex>::compute(const EigenBase<InputType>& matrix)
{
m_qr = matrix.derived();
computeInPlace();
return *this;
}
-template<typename MatrixType>
-void ColPivHouseholderQR<MatrixType>::computeInPlace()
+template<typename MatrixType, typename PermutationIndex>
+void ColPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace()
{
- // the column permutation is stored as int indices, so just to be sure:
- eigen_assert(m_qr.cols()<=NumTraits<int>::highest());
+
+ eigen_assert(m_qr.cols()<=NumTraits<PermutationIndex>::highest());
using std::abs;
@@ -549,7 +532,7 @@
m_nonzero_pivots = k;
// apply the transposition to the columns
- m_colsTranspositions.coeffRef(k) = biggest_col_index;
+ m_colsTranspositions.coeffRef(k) = static_cast<PermutationIndex>(biggest_col_index);
if(k != biggest_col_index) {
m_qr.col(k).swap(m_qr.col(biggest_col_index));
std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index));
@@ -595,18 +578,18 @@
}
}
- m_colsPermutation.setIdentity(PermIndexType(cols));
- for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
- m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
+ m_colsPermutation.setIdentity(cols);
+ for(Index k = 0; k < size/*m_nonzero_pivots*/; ++k)
+ m_colsPermutation.applyTranspositionOnTheRight(k, static_cast<Index>(m_colsTranspositions.coeff(k)));
m_det_p = (number_of_transpositions%2) ? -1 : 1;
m_isInitialized = true;
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
template<typename RhsType, typename DstType>
-void ColPivHouseholderQR<MatrixType_>::_solve_impl(const RhsType &rhs, DstType &dst) const
+void ColPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
const Index nonzero_pivots = nonzeroPivots();
@@ -628,9 +611,9 @@
for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
}
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
template<bool Conjugate, typename RhsType, typename DstType>
-void ColPivHouseholderQR<MatrixType_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+void ColPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
{
const Index nonzero_pivots = nonzeroPivots();
@@ -656,10 +639,10 @@
namespace internal {
-template<typename DstXprType, typename MatrixType>
-struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename ColPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+template<typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType, PermutationIndex> >, internal::assign_op<typename DstXprType::Scalar,typename ColPivHouseholderQR<MatrixType, PermutationIndex>::Scalar>, Dense2Dense>
{
- typedef ColPivHouseholderQR<MatrixType> QrType;
+ typedef ColPivHouseholderQR<MatrixType, PermutationIndex> QrType;
typedef Inverse<QrType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
{
@@ -672,8 +655,8 @@
/** \returns the matrix Q as a sequence of householder transformations.
* You can extract the meaningful part only by using:
* \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode*/
-template<typename MatrixType>
-typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
+template<typename MatrixType, typename PermutationIndex>
+typename ColPivHouseholderQR<MatrixType, PermutationIndex>::HouseholderSequenceType ColPivHouseholderQR<MatrixType, PermutationIndex>
::householderQ() const
{
eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
@@ -685,10 +668,11 @@
* \sa class ColPivHouseholderQR
*/
template<typename Derived>
-const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
+template<typename PermutationIndexType>
+const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndexType>
MatrixBase<Derived>::colPivHouseholderQr() const
{
- return ColPivHouseholderQR<PlainObject>(eval());
+ return ColPivHouseholderQR<PlainObject, PermutationIndexType>(eval());
}
} // end namespace Eigen
diff --git a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
index 7652d31..8a6ae9b 100644
--- a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -36,64 +36,117 @@
#include "./InternalHeaderCheck.h"
-namespace Eigen {
+namespace Eigen {
-/** \internal Specialization for the data types supported by LAPACKe */
+#if defined(EIGEN_USE_LAPACKE)
-#define EIGEN_LAPACKE_QR_COLPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
-template<> template<typename InputType> inline \
-ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
-ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
- const EigenBase<InputType>& matrix) \
-\
-{ \
- using std::abs; \
- typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
- typedef MatrixType::RealScalar RealScalar; \
- Index rows = matrix.rows();\
- Index cols = matrix.cols();\
-\
- m_qr = matrix;\
- Index size = m_qr.diagonalSize();\
- m_hCoeffs.resize(size);\
-\
- m_colsTranspositions.resize(cols);\
- /*Index number_of_transpositions = 0;*/ \
-\
- m_nonzero_pivots = 0; \
- m_maxpivot = RealScalar(0);\
- m_colsPermutation.resize(cols); \
- m_colsPermutation.indices().setZero(); \
-\
- lapack_int lda = internal::convert_index<lapack_int,Index>(m_qr.outerStride()); \
- lapack_int matrix_order = LAPACKE_COLROW; \
- LAPACKE_##LAPACKE_PREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
- (LAPACKE_TYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (LAPACKE_TYPE*)m_hCoeffs.data()); \
- m_isInitialized = true; \
- m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
- m_hCoeffs.adjointInPlace(); \
- RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
- lapack_int *perm = m_colsPermutation.indices().data(); \
- for(Index i=0;i<size;i++) { \
- m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
- } \
- for(Index i=0;i<cols;i++) perm[i]--;\
-\
- /*m_det_pq = (number_of_transpositions%2) ? -1 : 1; // TODO: It's not needed now; fix upon availability in Eigen */ \
-\
- return *this; \
-}
+ template<typename Scalar>
+ inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, Scalar* a, lapack_int lda, lapack_int* jpvt, Scalar* tau);
+ template<>
+ inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt, float* tau)
+ { return LAPACKE_sgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
+ template<>
+ inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt, double* tau)
+ { return LAPACKE_dgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
+ template<>
+ inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int* jpvt, lapack_complex_float* tau)
+ { return LAPACKE_cgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
+ template<>
+ inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int* jpvt, lapack_complex_double* tau)
+ { return LAPACKE_zgeqp3(matrix_layout, m, n, a, lda, jpvt, tau); }
-EIGEN_LAPACKE_QR_COLPIV(double, double, d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(float, float, s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float, c, ColMajor, LAPACK_COL_MAJOR)
+ template <typename MatrixType>
+ struct ColPivHouseholderQR_LAPACKE_impl {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename internal::lapacke_helpers::translate_type_imp<Scalar>::type LapackeType;
+ static constexpr int LapackeStorage = MatrixType::IsRowMajor ? (LAPACK_ROW_MAJOR) : (LAPACK_COL_MAJOR);
-EIGEN_LAPACKE_QR_COLPIV(double, double, d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(float, float, s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float, c, RowMajor, LAPACK_ROW_MAJOR)
+ typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+ typedef PermutationMatrix<Dynamic, Dynamic, lapack_int> PermutationType;
-} // end namespace Eigen
+ static void run(MatrixType& qr, HCoeffsType& hCoeffs, PermutationType& colsPermutation, Index& nonzero_pivots,
+ RealScalar& maxpivot, bool usePrescribedThreshold, RealScalar prescribedThreshold, Index& det_p,
+ bool& isInitialized) {
-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+ isInitialized = false;
+ hCoeffs.resize(qr.diagonalSize());
+ nonzero_pivots = 0;
+ maxpivot = RealScalar(0);
+ colsPermutation.resize(qr.cols());
+ colsPermutation.indices().setZero();
+
+ lapack_int rows = internal::lapacke_helpers::to_lapack(qr.rows());
+ lapack_int cols = internal::lapacke_helpers::to_lapack(qr.cols());
+ LapackeType* qr_data = (LapackeType*)(qr.data());
+ lapack_int lda = internal::lapacke_helpers::to_lapack(qr.outerStride());
+ lapack_int* perm_data = colsPermutation.indices().data();
+ LapackeType* hCoeffs_data = (LapackeType*)(hCoeffs.data());
+
+ lapack_int info = call_geqp3(LapackeStorage, rows, cols, qr_data, lda, perm_data, hCoeffs_data);
+ if (info != 0) return;
+
+ maxpivot = qr.diagonal().cwiseAbs().maxCoeff();
+ hCoeffs.adjointInPlace();
+ RealScalar defaultThreshold = NumTraits<RealScalar>::epsilon() * RealScalar(qr.diagonalSize());
+ RealScalar threshold = usePrescribedThreshold ? prescribedThreshold : defaultThreshold;
+ RealScalar premultiplied_threshold = maxpivot * threshold;
+ nonzero_pivots = (qr.diagonal().cwiseAbs().array() > premultiplied_threshold).count();
+ colsPermutation.indices().array() -= 1;
+ det_p = colsPermutation.determinant();
+ isInitialized = true;
+ };
+
+ static void init(Index rows, Index cols, HCoeffsType& hCoeffs, PermutationType& colsPermutation,
+ bool& usePrescribedThreshold, bool& isInitialized) {
+
+ Index diag = numext::mini(rows, cols);
+ hCoeffs.resize(diag);
+ colsPermutation.resize(cols);
+ usePrescribedThreshold = false;
+ isInitialized = false;
+ }
+ };
+
+ #define COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE) \
+ template <> inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::computeInPlace() { \
+ ColPivHouseholderQR_LAPACKE_impl<MatrixType>::run(m_qr, m_hCoeffs, m_colsPermutation, m_nonzero_pivots, \
+ m_maxpivot, m_usePrescribedThreshold, m_prescribedThreshold, \
+ m_det_p, m_isInitialized); } \
+
+ #define COLPIVQR_LAPACKE_INIT(EIGTYPE) \
+ template <> inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::init(Index rows, Index cols) { \
+ ColPivHouseholderQR_LAPACKE_impl<MatrixType>::init(rows, cols, m_hCoeffs, m_colsPermutation, m_isInitialized, \
+ m_usePrescribedThreshold); } \
+
+ #define COLPIVQR_LAPACKE(EIGTYPE) \
+ COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE) \
+ COLPIVQR_LAPACKE_INIT(EIGTYPE) \
+
+ typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXfR;
+ typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXdR;
+ typedef Matrix<scomplex, Dynamic, Dynamic, RowMajor> MatrixXcfR;
+ typedef Matrix<dcomplex, Dynamic, Dynamic, RowMajor> MatrixXcdR;
+
+ COLPIVQR_LAPACKE(MatrixXf)
+ COLPIVQR_LAPACKE(MatrixXd)
+ COLPIVQR_LAPACKE(MatrixXcf)
+ COLPIVQR_LAPACKE(MatrixXcd)
+ COLPIVQR_LAPACKE(MatrixXfR)
+ COLPIVQR_LAPACKE(MatrixXdR)
+ COLPIVQR_LAPACKE(MatrixXcfR)
+ COLPIVQR_LAPACKE(MatrixXcdR)
+
+ COLPIVQR_LAPACKE(Ref<MatrixXf>)
+ COLPIVQR_LAPACKE(Ref<MatrixXd>)
+ COLPIVQR_LAPACKE(Ref<MatrixXcf>)
+ COLPIVQR_LAPACKE(Ref<MatrixXcd>)
+ COLPIVQR_LAPACKE(Ref<MatrixXfR>)
+ COLPIVQR_LAPACKE(Ref<MatrixXdR>)
+ COLPIVQR_LAPACKE(Ref<MatrixXcfR>)
+ COLPIVQR_LAPACKE(Ref<MatrixXcdR>)
+
+#endif
+} // end namespace Eigen
+
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 02583a2..171e601 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -15,12 +15,12 @@
namespace Eigen {
namespace internal {
-template <typename MatrixType_>
-struct traits<CompleteOrthogonalDecomposition<MatrixType_> >
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_> >
: traits<MatrixType_> {
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
- typedef int StorageIndex;
+ typedef PermutationIndex_ PermutationIndex;
enum { Flags = 0 };
};
@@ -49,8 +49,8 @@
*
* \sa MatrixBase::completeOrthogonalDecomposition()
*/
-template <typename MatrixType_> class CompleteOrthogonalDecomposition
- : public SolverBase<CompleteOrthogonalDecomposition<MatrixType_> >
+template <typename MatrixType_, typename PermutationIndex_> class CompleteOrthogonalDecomposition
+ : public SolverBase<CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_> >
{
public:
typedef MatrixType_ MatrixType;
@@ -58,14 +58,14 @@
template<typename Derived>
friend struct internal::solve_assertion;
-
+ typedef PermutationIndex_ PermutationIndex;
EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
- typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
+ typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex>
PermutationType;
typedef typename internal::plain_row_type<MatrixType, Index>::type
IntRowVectorType;
@@ -78,9 +78,6 @@
HouseholderSequenceType;
typedef typename MatrixType::PlainObject PlainObject;
- private:
- typedef typename PermutationType::Index PermIndexType;
-
public:
/**
* \brief Default Constructor.
@@ -417,26 +414,26 @@
template <typename Rhs>
void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const;
- ColPivHouseholderQR<MatrixType> m_cpqr;
+ ColPivHouseholderQR<MatrixType, PermutationIndex> m_cpqr;
HCoeffsType m_zCoeffs;
RowVectorType m_temp;
};
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
typename MatrixType::Scalar
-CompleteOrthogonalDecomposition<MatrixType>::determinant() const {
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::determinant() const {
return m_cpqr.determinant();
}
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
typename MatrixType::RealScalar
-CompleteOrthogonalDecomposition<MatrixType>::absDeterminant() const {
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::absDeterminant() const {
return m_cpqr.absDeterminant();
}
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
typename MatrixType::RealScalar
-CompleteOrthogonalDecomposition<MatrixType>::logAbsDeterminant() const {
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::logAbsDeterminant() const {
return m_cpqr.logAbsDeterminant();
}
@@ -447,11 +444,10 @@
* \sa class CompleteOrthogonalDecomposition,
* CompleteOrthogonalDecomposition(const MatrixType&)
*/
-template <typename MatrixType>
-void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
+template <typename MatrixType, typename PermutationIndex>
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::computeInPlace()
{
- // the column permutation is stored as int indices, so just to be sure:
- eigen_assert(m_cpqr.cols() <= NumTraits<int>::highest());
+ eigen_assert(m_cpqr.cols() <= NumTraits<PermutationIndex>::highest());
const Index rank = m_cpqr.rank();
const Index cols = m_cpqr.cols();
@@ -503,9 +499,9 @@
}
}
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
template <bool Conjugate, typename Rhs>
-void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::applyZOnTheLeftInPlace(
Rhs& rhs) const {
const Index cols = this->cols();
const Index nrhs = rhs.cols();
@@ -525,9 +521,9 @@
}
}
-template <typename MatrixType>
+template <typename MatrixType, typename PermutationIndex>
template <typename Rhs>
-void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::applyZAdjointOnTheLeftInPlace(
Rhs& rhs) const {
const Index cols = this->cols();
const Index nrhs = rhs.cols();
@@ -548,9 +544,9 @@
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
-template <typename MatrixType_>
+template <typename MatrixType_, typename PermutationIndex_>
template <typename RhsType, typename DstType>
-void CompleteOrthogonalDecomposition<MatrixType_>::_solve_impl(
+void CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>::_solve_impl(
const RhsType& rhs, DstType& dst) const {
const Index rank = this->rank();
if (rank == 0) {
@@ -580,9 +576,9 @@
dst = colsPermutation() * dst;
}
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
template<bool Conjugate, typename RhsType, typename DstType>
-void CompleteOrthogonalDecomposition<MatrixType_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+void CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
{
const Index rank = this->rank();
@@ -611,17 +607,17 @@
namespace internal {
-template<typename MatrixType>
-struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType> > >
+template<typename MatrixType, typename PermutationIndex>
+struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> > >
: traits<typename Transpose<typename MatrixType::PlainObject>::PlainObject>
{
enum { Flags = 0 };
};
-template<typename DstXprType, typename MatrixType>
-struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
+template<typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::Scalar>, Dense2Dense>
{
- typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
+ typedef CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> CodType;
typedef Inverse<CodType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
{
@@ -633,9 +629,9 @@
} // end namespace internal
/** \returns the matrix Q as a sequence of householder transformations */
-template <typename MatrixType>
-typename CompleteOrthogonalDecomposition<MatrixType>::HouseholderSequenceType
-CompleteOrthogonalDecomposition<MatrixType>::householderQ() const {
+template <typename MatrixType, typename PermutationIndex>
+typename CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::HouseholderSequenceType
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::householderQ() const {
return m_cpqr.householderQ();
}
@@ -644,7 +640,8 @@
* \sa class CompleteOrthogonalDecomposition
*/
template <typename Derived>
-const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject>
+template <typename PermutationIndex>
+const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
MatrixBase<Derived>::completeOrthogonalDecomposition() const {
return CompleteOrthogonalDecomposition<PlainObject>(eval());
}
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index ec7e19b..588f917 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -17,19 +17,19 @@
namespace internal {
-template<typename MatrixType_> struct traits<FullPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> struct traits<FullPivHouseholderQR<MatrixType_, PermutationIndex_> >
: traits<MatrixType_>
{
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
- typedef int StorageIndex;
+ typedef PermutationIndex_ PermutationIndex;
enum { Flags = 0 };
};
-template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
+template<typename MatrixType, typename PermutationIndex> struct FullPivHouseholderQRMatrixQReturnType;
-template<typename MatrixType>
-struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+template<typename MatrixType, typename PermutationIndex>
+struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> >
{
typedef typename MatrixType::PlainObject ReturnType;
};
@@ -59,26 +59,27 @@
*
* \sa MatrixBase::fullPivHouseholderQr()
*/
-template<typename MatrixType_> class FullPivHouseholderQR
- : public SolverBase<FullPivHouseholderQR<MatrixType_> >
+template<typename MatrixType_, typename PermutationIndex_> class FullPivHouseholderQR
+ : public SolverBase<FullPivHouseholderQR<MatrixType_, PermutationIndex_> >
{
public:
typedef MatrixType_ MatrixType;
typedef SolverBase<FullPivHouseholderQR> Base;
friend class SolverBase<FullPivHouseholderQR>;
-
+ typedef PermutationIndex_ PermutationIndex;
EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
+
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
- typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
+ typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> MatrixQReturnType;
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
- typedef Matrix<StorageIndex, 1,
+ typedef Matrix<PermutationIndex, 1,
internal::min_size_prefer_dynamic(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
internal::min_size_prefer_fixed(MaxColsAtCompileTime, MaxRowsAtCompileTime)> IntDiagSizeVectorType;
- typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
+ typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
typedef typename MatrixType::PlainObject PlainObject;
@@ -437,8 +438,8 @@
Index m_det_p;
};
-template<typename MatrixType>
-typename MatrixType::Scalar FullPivHouseholderQR<MatrixType>::determinant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar FullPivHouseholderQR<MatrixType, PermutationIndex>::determinant() const
{
eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -447,8 +448,8 @@
return m_qr.diagonal().prod() * detQ * Scalar(m_det_p);
}
-template<typename MatrixType>
-typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::absDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType, PermutationIndex>::absDeterminant() const
{
using std::abs;
eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
@@ -456,8 +457,8 @@
return abs(m_qr.diagonal().prod());
}
-template<typename MatrixType>
-typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDeterminant() const
+template<typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType, PermutationIndex>::logAbsDeterminant() const
{
eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
@@ -470,18 +471,19 @@
*
* \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
*/
-template<typename MatrixType>
+template<typename MatrixType, typename PermutationIndex>
template<typename InputType>
-FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+FullPivHouseholderQR<MatrixType, PermutationIndex>& FullPivHouseholderQR<MatrixType, PermutationIndex>::compute(const EigenBase<InputType>& matrix)
{
m_qr = matrix.derived();
computeInPlace();
return *this;
}
-template<typename MatrixType>
-void FullPivHouseholderQR<MatrixType>::computeInPlace()
+template<typename MatrixType, typename PermutationIndex>
+void FullPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace()
{
+ eigen_assert(m_qr.cols() <= NumTraits<PermutationIndex>::highest());
using std::abs;
Index rows = m_qr.rows();
Index cols = m_qr.cols();
@@ -523,15 +525,15 @@
m_nonzero_pivots = k;
for(Index i = k; i < size; i++)
{
- m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
- m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+ m_rows_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
+ m_cols_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
m_hCoeffs.coeffRef(i) = Scalar(0);
}
break;
}
- m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
- m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
+ m_rows_transpositions.coeffRef(k) = internal::convert_index<PermutationIndex>(row_of_biggest_in_corner);
+ m_cols_transpositions.coeffRef(k) = internal::convert_index<PermutationIndex>(col_of_biggest_in_corner);
if(k != row_of_biggest_in_corner) {
m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
++number_of_transpositions;
@@ -561,9 +563,9 @@
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
template<typename RhsType, typename DstType>
-void FullPivHouseholderQR<MatrixType_>::_solve_impl(const RhsType &rhs, DstType &dst) const
+void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
const Index l_rank = rank();
@@ -595,9 +597,9 @@
for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
}
-template<typename MatrixType_>
+template<typename MatrixType_, typename PermutationIndex_>
template<bool Conjugate, typename RhsType, typename DstType>
-void FullPivHouseholderQR<MatrixType_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
{
const Index l_rank = rank();
@@ -634,10 +636,10 @@
namespace internal {
-template<typename DstXprType, typename MatrixType>
-struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+template<typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType, PermutationIndex> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType, PermutationIndex>::Scalar>, Dense2Dense>
{
- typedef FullPivHouseholderQR<MatrixType> QrType;
+ typedef FullPivHouseholderQR<MatrixType, PermutationIndex> QrType;
typedef Inverse<QrType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
{
@@ -651,11 +653,11 @@
*
* \tparam MatrixType type of underlying dense matrix
*/
-template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
- : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+template<typename MatrixType, typename PermutationIndex> struct FullPivHouseholderQRMatrixQReturnType
+ : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> >
{
public:
- typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
+ typedef typename FullPivHouseholderQR<MatrixType, PermutationIndex>::IntDiagSizeVectorType IntDiagSizeVectorType;
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
MatrixType::MaxRowsAtCompileTime> WorkVectorType;
@@ -712,8 +714,8 @@
} // end namespace internal
-template<typename MatrixType>
-inline typename FullPivHouseholderQR<MatrixType>::MatrixQReturnType FullPivHouseholderQR<MatrixType>::matrixQ() const
+template<typename MatrixType, typename PermutationIndex>
+inline typename FullPivHouseholderQR<MatrixType, PermutationIndex>::MatrixQReturnType FullPivHouseholderQR<MatrixType, PermutationIndex>::matrixQ() const
{
eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
@@ -724,10 +726,11 @@
* \sa class FullPivHouseholderQR
*/
template<typename Derived>
-const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
+template<typename PermutationIndex>
+const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
MatrixBase<Derived>::fullPivHouseholderQr() const
{
- return FullPivHouseholderQR<PlainObject>(eval());
+ return FullPivHouseholderQR<PlainObject, PermutationIndex>(eval());
}
} // end namespace Eigen
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index d89d99e..6b0fb4b 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -193,10 +193,8 @@
inline void moveChunk(Index from, Index to, Index chunkSize)
{
eigen_internal_assert(chunkSize >= 0 && to+chunkSize <= m_size);
- if (chunkSize > 0) {
- internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
- internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
- }
+ internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
+ internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
}
protected:
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index 243cd16..95af15a 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -60,14 +60,14 @@
/** \returns the number of non zero coefficients */
inline Index nonZeros() const
{
- if(Derived::IsVectorAtCompileTime && outerIndexPtr()==0)
- return derived().nonZeros();
- else if(isCompressed())
- return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0];
- else if(derived().outerSize()==0)
- return 0;
- else
- return innerNonZeros().sum();
+ if (Derived::IsVectorAtCompileTime && outerIndexPtr() == 0)
+ return derived().nonZeros();
+ else if (derived().outerSize() == 0)
+ return 0;
+ else if (isCompressed())
+ return outerIndexPtr()[derived().outerSize()] - outerIndexPtr()[0];
+ else
+ return innerNonZeros().sum();
}
/** \returns a const pointer to the array of values.
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index eaefcef..bf1d562 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -238,13 +238,24 @@
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
Index start = m_outerIndex[outer];
- Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer + 1];
+ Index end = isCompressed() ? m_outerIndex[outer + 1] : m_outerIndex[outer] + m_innerNonZeros[outer];
eigen_assert(end >= start && "you probably called coeffRef on a non finalized matrix");
- if (end <= start) return insertAtByOuterInner(outer, inner, start);
- Index dst = m_data.searchLowerIndex(start, end, inner);
+ Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+ if (dst == end) {
+ Index capacity = m_outerIndex[outer + 1] - end;
+ if (capacity > 0) {
+ // implies uncompressed: push to back of vector
+ m_innerNonZeros[outer]++;
+ m_data.index(end) = StorageIndex(inner);
+ m_data.value(end) = Scalar(0);
+ return m_data.value(end);
+ }
+ }
if ((dst < end) && (m_data.index(dst) == inner))
      // this coefficient exists, return a reference to it
return m_data.value(dst);
else
+ // insertion will require reconfiguring the buffer
return insertAtByOuterInner(outer, inner, dst);
}
@@ -323,7 +334,7 @@
if(isCompressed())
{
Index totalReserveSize = 0;
- for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += reserveSizes[j];
+ for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += internal::convert_index<Index>(reserveSizes[j]);
// if reserveSizes is empty, don't do anything!
if (totalReserveSize == 0) return;
@@ -334,11 +345,12 @@
// temporarily use m_innerSizes to hold the new starting points.
StorageIndex* newOuterIndex = m_innerNonZeros;
- StorageIndex count = 0;
+ Index count = 0;
for(Index j=0; j<m_outerSize; ++j)
{
- newOuterIndex[j] = count;
- count += reserveSizes[j] + (m_outerIndex[j+1]-m_outerIndex[j]);
+ newOuterIndex[j] = internal::convert_index<StorageIndex>(count);
+ Index reserveSize = internal::convert_index<Index>(reserveSizes[j]);
+ count += reserveSize + internal::convert_index<Index>(m_outerIndex[j+1]-m_outerIndex[j]);
}
m_data.reserve(totalReserveSize);
@@ -364,29 +376,24 @@
{
StorageIndex* newOuterIndex = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize + 1);
- StorageIndex count = 0;
+ Index count = 0;
for(Index j=0; j<m_outerSize; ++j)
{
- newOuterIndex[j] = count;
- StorageIndex alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
- StorageIndex toReserve = std::max<StorageIndex>(reserveSizes[j], alreadyReserved);
- count += toReserve + m_innerNonZeros[j];
+ newOuterIndex[j] = internal::convert_index<StorageIndex>(count);
+ Index alreadyReserved = internal::convert_index<Index>(m_outerIndex[j+1] - m_outerIndex[j] - m_innerNonZeros[j]);
+ Index reserveSize = internal::convert_index<Index>(reserveSizes[j]);
+ Index toReserve = numext::maxi(reserveSize, alreadyReserved);
+ count += toReserve + internal::convert_index<Index>(m_innerNonZeros[j]);
}
- newOuterIndex[m_outerSize] = count;
-
+ newOuterIndex[m_outerSize] = internal::convert_index<StorageIndex>(count);
+
m_data.resize(count);
for(Index j=m_outerSize-1; j>=0; --j)
{
- StorageIndex offset = newOuterIndex[j] - m_outerIndex[j];
- if(offset>0)
- {
- StorageIndex innerNNZ = m_innerNonZeros[j];
- StorageIndex begin = m_outerIndex[j];
- StorageIndex end = begin + innerNNZ;
- StorageIndex target = newOuterIndex[j];
- internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
- internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
- }
+ StorageIndex innerNNZ = m_innerNonZeros[j];
+ StorageIndex begin = m_outerIndex[j];
+ StorageIndex target = newOuterIndex[j];
+ m_data.moveChunk(begin, target, innerNNZ);
}
std::swap(m_outerIndex, newOuterIndex);
@@ -488,7 +495,22 @@
* same as insert(Index,Index) except that the indices are given relative to the storage order */
Scalar& insertByOuterInner(Index j, Index i)
{
- return insert(IsRowMajor ? j : i, IsRowMajor ? i : j);
+ Index start = m_outerIndex[j];
+ Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
+ Index dst = start == end ? end : m_data.searchLowerIndex(start, end, i);
+ if (dst == end) {
+ Index capacity = m_outerIndex[j + 1] - end;
+ if (capacity > 0) {
+ // implies uncompressed: push to back of vector
+ m_innerNonZeros[j]++;
+ m_data.index(end) = StorageIndex(i);
+ m_data.value(end) = Scalar(0);
+ return m_data.value(end);
+ }
+ }
+ eigen_assert((dst == end || m_data.index(dst) != i) &&
+ "you cannot insert an element that already exists, you must call coeffRef to this end");
+ return insertAtByOuterInner(j, i, dst);
}
/** Turns the matrix into the \em compressed format.
@@ -498,36 +520,45 @@
if (isCompressed()) return;
eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
-
+
StorageIndex start = m_outerIndex[1];
m_outerIndex[1] = m_innerNonZeros[0];
- for(Index j=1; j<m_outerSize; ++j)
+ // try to move fewer, larger contiguous chunks
+ Index copyStart = start;
+ Index copyTarget = m_innerNonZeros[0];
+ for (Index j = 1; j < m_outerSize; j++)
{
StorageIndex end = start + m_innerNonZeros[j];
- StorageIndex target = m_outerIndex[j];
- if (start != target)
+ StorageIndex nextStart = m_outerIndex[j + 1];
      // don't forget to move the last chunk!
+ bool breakUpCopy = (end != nextStart) || (j == m_outerSize - 1);
+ if (breakUpCopy)
{
- internal::smart_memmove(innerIndexPtr() + start, innerIndexPtr() + end, innerIndexPtr() + target);
- internal::smart_memmove(valuePtr() + start, valuePtr() + end, valuePtr() + target);
+ Index chunkSize = end - copyStart;
+ if(chunkSize > 0) m_data.moveChunk(copyStart, copyTarget, chunkSize);
+ copyStart = nextStart;
+ copyTarget += chunkSize;
}
- start = m_outerIndex[j + 1];
+ start = nextStart;
m_outerIndex[j + 1] = m_outerIndex[j] + m_innerNonZeros[j];
}
+ m_data.resize(m_outerIndex[m_outerSize]);
+
+ // release as much memory as possible
internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
m_innerNonZeros = 0;
- m_data.resize(m_outerIndex[m_outerSize]);
m_data.squeeze();
}
/** Turns the matrix into the uncompressed mode */
void uncompress()
{
- if(m_innerNonZeros != 0) return;
+ if (!isCompressed()) return;
m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
- typename IndexVector::AlignedMapType innerNonZeroMap(m_innerNonZeros, m_outerSize);
- typename IndexVector::ConstAlignedMapType outerIndexMap(m_outerIndex, m_outerSize);
- typename IndexVector::ConstMapType nextOuterIndexMap(m_outerIndex + 1, m_outerSize);
- innerNonZeroMap = nextOuterIndexMap - outerIndexMap;
+ if (m_outerIndex[m_outerSize] == 0)
+ std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+ else
+ for (Index j = 0; j < m_outerSize; j++) m_innerNonZeros[j] = m_outerIndex[j + 1] - m_outerIndex[j];
}
/** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
@@ -590,40 +621,40 @@
Index newOuterSize = IsRowMajor ? rows : cols;
Index newInnerSize = IsRowMajor ? cols : rows;
- Index innerChange = newInnerSize - innerSize();
- Index outerChange = newOuterSize - outerSize();
+ Index innerChange = newInnerSize - m_innerSize;
+ Index outerChange = newOuterSize - m_outerSize;
if (outerChange != 0) {
m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
- outerIndexPtr(), newOuterSize + 1, outerSize() + 1);
+ m_outerIndex, newOuterSize + 1, m_outerSize + 1);
if (!isCompressed())
m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
- innerNonZeroPtr(), newOuterSize, outerSize());
+ m_innerNonZeros, newOuterSize, m_outerSize);
if (outerChange > 0) {
- StorageIndex lastIdx = outerSize() == 0 ? StorageIndex(0) : outerIndexPtr()[outerSize()];
- std::fill_n(outerIndexPtr() + outerSize(), outerChange + 1, lastIdx);
+ StorageIndex lastIdx = m_outerSize == 0 ? StorageIndex(0) : m_outerIndex[m_outerSize];
+ std::fill_n(m_outerIndex + m_outerSize, outerChange + 1, lastIdx);
- if (!isCompressed()) std::fill_n(innerNonZeroPtr() + outerSize(), outerChange, StorageIndex(0));
+ if (!isCompressed()) std::fill_n(m_innerNonZeros + m_outerSize, outerChange, StorageIndex(0));
}
}
m_outerSize = newOuterSize;
if (innerChange < 0) {
- for (Index j = 0; j < outerSize(); j++) {
- Index start = outerIndexPtr()[j];
- Index end = isCompressed() ? outerIndexPtr()[j + 1] : start + innerNonZeroPtr()[j];
- Index lb = data().searchLowerIndex(start, end, newInnerSize);
+ for (Index j = 0; j < m_outerSize; j++) {
+ Index start = m_outerIndex[j];
+ Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
+ Index lb = m_data.searchLowerIndex(start, end, newInnerSize);
if (lb != end) {
uncompress();
- innerNonZeroPtr()[j] = StorageIndex(lb - start);
+ m_innerNonZeros[j] = StorageIndex(lb - start);
}
}
}
m_innerSize = newInnerSize;
- Index newSize = outerIndexPtr()[outerSize()];
+ Index newSize = m_outerIndex[m_outerSize];
eigen_assert(newSize <= m_data.size());
m_data.resize(newSize);
}
@@ -640,17 +671,15 @@
const Index outerSize = IsRowMajor ? rows : cols;
m_innerSize = IsRowMajor ? cols : rows;
m_data.clear();
- if (m_outerSize != outerSize || m_outerSize==0)
- {
- m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_outerIndex, outerSize + 1,
- m_outerSize + 1);
+
+ if ((m_outerIndex == 0) || (m_outerSize != outerSize)) {
+ m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_outerIndex, outerSize + 1, m_outerSize + 1);
m_outerSize = outerSize;
}
- if(m_innerNonZeros)
- {
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
- }
+
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
+
std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
}
@@ -672,7 +701,7 @@
/** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
inline SparseMatrix()
- : m_outerSize(-1), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+ : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
{
resize(0, 0);
}
@@ -756,20 +785,15 @@
* This function also turns the matrix into compressed mode, and drop any reserved memory. */
inline void setIdentity()
{
- eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
- if (m_innerNonZeros) {
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
- }
- m_data.resize(rows());
+ eigen_assert(m_outerSize == m_innerSize && "ONLY FOR SQUARED MATRICES");
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
+ m_data.resize(m_outerSize);
// is it necessary to squeeze?
m_data.squeeze();
- typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), rows() + 1);
- typename IndexVector::AlignedMapType innerIndexMap(innerIndexPtr(), rows());
- typename ScalarVector::AlignedMapType valueMap(valuePtr(), rows());
- outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
- innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
- valueMap.setOnes();
+ std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
+ std::iota(innerIndexPtr(), innerIndexPtr() + m_outerSize, StorageIndex(0));
+ std::fill_n(valuePtr(), m_outerSize, Scalar(1));
}
inline SparseMatrix& operator=(const SparseMatrix& other)
@@ -880,11 +904,8 @@
void initAssignment(const Other& other)
{
resize(other.rows(), other.cols());
- if(m_innerNonZeros)
- {
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
- }
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
}
/** \internal
@@ -922,8 +943,9 @@
eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
- m_data.index(p) = convert_index(inner);
- return (m_data.value(p) = Scalar(0));
+ m_data.index(p) = StorageIndex(inner);
+ m_data.value(p) = Scalar(0);
+ return m_data.value(p);
}
protected:
struct IndexPosPair {
@@ -946,7 +968,6 @@
{
constexpr StorageIndex kEmptyIndexVal(-1);
- typedef typename IndexVector::AlignedMapType IndexMap;
typedef typename ScalarVector::AlignedMapType ValueMap;
Index n = diagXpr.size();
@@ -954,22 +975,18 @@
const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
if(overwrite)
{
- if((this->rows()!=n) || (this->cols()!=n))
- this->resize(n, n);
+ if((m_outerSize != n) || (m_innerSize != n))
+ resize(n, n);
}
- if(data().size()==0 || overwrite)
+ if(m_data.size()==0 || overwrite)
{
- if (!isCompressed()) {
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
- }
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
resizeNonZeros(n);
- IndexMap outerIndexMap(outerIndexPtr(), n + 1);
- IndexMap innerIndexMap(innerIndexPtr(), n);
ValueMap valueMap(valuePtr(), n);
- outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
- innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+ std::iota(m_outerIndex, m_outerIndex + n + 1, StorageIndex(0));
+ std::iota(innerIndexPtr(), innerIndexPtr() + n, StorageIndex(0));
valueMap.setZero();
internal::call_assignment_no_alias(valueMap, diagXpr, assignFunc);
}
@@ -985,12 +1002,12 @@
Index shift = 0;
for (Index j = 0; j < n; j++) {
- Index begin = outerIndexPtr()[j];
- Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
- Index capacity = outerIndexPtr()[j + 1] - end;
- Index dst = data().searchLowerIndex(begin, end, j);
+ Index begin = m_outerIndex[j];
+ Index end = isCompressed() ? m_outerIndex[j + 1] : begin + m_innerNonZeros[j];
+ Index capacity = m_outerIndex[j + 1] - end;
+ Index dst = m_data.searchLowerIndex(begin, end, j);
// the entry exists: update it now
- if (dst != end && data().index(dst) == j) assignFunc.assignCoeff(data().value(dst), diaEval.coeff(j));
+ if (dst != end && m_data.index(dst) == StorageIndex(j)) assignFunc.assignCoeff(m_data.value(dst), diaEval.coeff(j));
// the entry belongs at the back of the vector: push to back
else if (dst == end && capacity > 0)
assignFunc.assignCoeff(insertBackUncompressed(j, j), diaEval.coeff(j));
@@ -1005,13 +1022,13 @@
if (deferredInsertions > 0) {
- data().resize(data().size() + shift);
- Index copyEnd = isCompressed() ? outerIndexPtr()[outerSize()]
- : outerIndexPtr()[outerSize() - 1] + innerNonZeroPtr()[outerSize() - 1];
- for (Index j = outerSize() - 1; deferredInsertions > 0; j--) {
- Index begin = outerIndexPtr()[j];
- Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
- Index capacity = outerIndexPtr()[j + 1] - end;
+ m_data.resize(m_data.size() + shift);
+ Index copyEnd = isCompressed() ? m_outerIndex[m_outerSize]
+ : m_outerIndex[m_outerSize - 1] + m_innerNonZeros[m_outerSize - 1];
+ for (Index j = m_outerSize - 1; deferredInsertions > 0; j--) {
+ Index begin = m_outerIndex[j];
+ Index end = isCompressed() ? m_outerIndex[j + 1] : begin + m_innerNonZeros[j];
+ Index capacity = m_outerIndex[j + 1] - end;
bool doInsertion = insertionLocations(j) >= 0;
bool breakUpCopy = doInsertion && (capacity > 0);
@@ -1020,14 +1037,14 @@
// where `threshold >= 0` to skip inactive nonzeros in each vector
// this reduces the total number of copied elements, but requires more moveChunk calls
if (breakUpCopy) {
- Index copyBegin = outerIndexPtr()[j + 1];
+ Index copyBegin = m_outerIndex[j + 1];
Index to = copyBegin + shift;
Index chunkSize = copyEnd - copyBegin;
- if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+ m_data.moveChunk(copyBegin, to, chunkSize);
copyEnd = end;
}
- outerIndexPtr()[j + 1] += shift;
+ m_outerIndex[j + 1] += shift;
if (doInsertion) {
// if there is capacity, shift into the inactive nonzeros
@@ -1035,11 +1052,12 @@
Index copyBegin = insertionLocations(j);
Index to = copyBegin + shift;
Index chunkSize = copyEnd - copyBegin;
- if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+ m_data.moveChunk(copyBegin, to, chunkSize);
Index dst = to - 1;
- data().index(dst) = StorageIndex(j);
- assignFunc.assignCoeff(data().value(dst) = Scalar(0), diaEval.coeff(j));
- if (!isCompressed()) innerNonZeroPtr()[j]++;
+ m_data.index(dst) = StorageIndex(j);
+ m_data.value(dst) = Scalar(0);
+ assignFunc.assignCoeff(m_data.value(dst), diaEval.coeff(j));
+ if (!isCompressed()) m_innerNonZeros[j]++;
shift--;
deferredInsertions--;
copyEnd = copyBegin;
@@ -1050,9 +1068,6 @@
}
}
- /* Provides a consistent reserve and reallocation strategy for insertCompressed and insertUncompressed
- If there is insufficient space for one insertion, the size of the memory buffer is doubled */
- inline void checkAllocatedSpaceAndMaybeExpand();
/* These functions are used to avoid a redundant binary search operation in functions such as coeffRef() and assume `dst` is the appropriate sorted insertion point */
EIGEN_STRONG_INLINE Scalar& insertAtByOuterInner(Index outer, Index inner, Index dst);
Scalar& insertCompressedAtByOuterInner(Index outer, Index inner, Index dst);
@@ -1299,10 +1314,8 @@
}
m_outerIndex[m_outerSize] = count;
// turn the matrix into compressed form
- if (m_innerNonZeros) {
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
- }
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
m_data.resize(m_outerIndex[m_outerSize]);
}
@@ -1383,21 +1396,15 @@
}
template <typename Scalar_, int Options_, typename StorageIndex_>
-inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar& SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(
- Index row, Index col) {
- Index outer = IsRowMajor ? row : col;
- Index inner = IsRowMajor ? col : row;
- Index start = outerIndexPtr()[outer];
- Index end = isCompressed() ? outerIndexPtr()[outer + 1] : start + innerNonZeroPtr()[outer];
- Index dst = data().searchLowerIndex(start, end, inner);
- eigen_assert((dst == end || data().index(dst) != inner) &&
- "you cannot insert an element that already exists, you must call coeffRef to this end");
- return insertAtByOuterInner(outer, inner, dst);
+inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(Index row, Index col) {
+ return insertByOuterInner(IsRowMajor ? row : col, IsRowMajor ? col : row);
}
template <typename Scalar_, int Options_, typename StorageIndex_>
EIGEN_STRONG_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
SparseMatrix<Scalar_, Options_, StorageIndex_>::insertAtByOuterInner(Index outer, Index inner, Index dst) {
+ // random insertion into compressed matrix is very slow
uncompress();
return insertUncompressedAtByOuterInner(outer, inner, dst);
}
@@ -1408,10 +1415,20 @@
eigen_assert(!isCompressed());
Index outer = IsRowMajor ? row : col;
Index inner = IsRowMajor ? col : row;
- Index start = outerIndexPtr()[outer];
- Index end = start + innerNonZeroPtr()[outer];
- Index dst = data().searchLowerIndex(start, end, inner);
- eigen_assert((dst == end || data().index(dst) != inner) &&
+ Index start = m_outerIndex[outer];
+ Index end = start + m_innerNonZeros[outer];
+ Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+ if (dst == end) {
+ Index capacity = m_outerIndex[outer + 1] - end;
+ if (capacity > 0) {
+ // implies uncompressed: push to back of vector
+ m_innerNonZeros[outer]++;
+ m_data.index(end) = StorageIndex(inner);
+ m_data.value(end) = Scalar(0);
+ return m_data.value(end);
+ }
+ }
+ eigen_assert((dst == end || m_data.index(dst) != inner) &&
"you cannot insert an element that already exists, you must call coeffRef to this end");
return insertUncompressedAtByOuterInner(outer, inner, dst);
}
@@ -1422,97 +1439,119 @@
eigen_assert(isCompressed());
Index outer = IsRowMajor ? row : col;
Index inner = IsRowMajor ? col : row;
- Index start = outerIndexPtr()[outer];
- Index end = outerIndexPtr()[outer + 1];
- Index dst = data().searchLowerIndex(start, end, inner);
- eigen_assert((dst == end || data().index(dst) != inner) &&
+ Index start = m_outerIndex[outer];
+ Index end = m_outerIndex[outer + 1];
+ Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+ eigen_assert((dst == end || m_data.index(dst) != inner) &&
"you cannot insert an element that already exists, you must call coeffRef to this end");
return insertCompressedAtByOuterInner(outer, inner, dst);
}
template <typename Scalar_, int Options_, typename StorageIndex_>
-EIGEN_STRONG_INLINE void SparseMatrix<Scalar_, Options_, StorageIndex_>::checkAllocatedSpaceAndMaybeExpand() {
- // if there is no capacity for a single insertion, double the capacity
- if (data().allocatedSize() <= data().size()) {
- // increase capacity by a mininum of 32
- Index minReserve = 32;
- Index reserveSize = numext::maxi(minReserve, data().allocatedSize());
- data().reserve(reserveSize);
- }
-}
-
-template <typename Scalar_, int Options_, typename StorageIndex_>
typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressedAtByOuterInner(Index outer, Index inner, Index dst) {
eigen_assert(isCompressed());
// compressed insertion always requires expanding the buffer
- checkAllocatedSpaceAndMaybeExpand();
- data().resize(data().size() + 1);
- Index chunkSize = outerIndexPtr()[outerSize()] - dst;
+ // first, check if there is adequate allocated memory
+ if (m_data.allocatedSize() <= m_data.size()) {
+ // if there is no capacity for a single insertion, double the capacity
+    // increase capacity by a minimum of 32
+ Index minReserve = 32;
+ Index reserveSize = numext::maxi(minReserve, m_data.allocatedSize());
+ m_data.reserve(reserveSize);
+ }
+ m_data.resize(m_data.size() + 1);
+ Index chunkSize = m_outerIndex[m_outerSize] - dst;
// shift the existing data to the right if necessary
- if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
+ m_data.moveChunk(dst, dst + 1, chunkSize);
// update nonzero counts
- typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
- outerIndexMap.segment(outer + 1, outerSize() - outer).array() += 1;
+ // potentially O(outerSize) bottleneck!
+ for (Index j = outer; j < m_outerSize; j++) m_outerIndex[j + 1]++;
// initialize the coefficient
- data().index(dst) = StorageIndex(inner);
+ m_data.index(dst) = StorageIndex(inner);
+ m_data.value(dst) = Scalar(0);
// return a reference to the coefficient
- return data().value(dst) = Scalar(0);
+ return m_data.value(dst);
}
template <typename Scalar_, int Options_, typename StorageIndex_>
typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst) {
eigen_assert(!isCompressed());
- // find nearest outer vector to the right with capacity (if any) to minimize copy size
- Index target = outer;
- for (; target < outerSize(); target++) {
- Index start = outerIndexPtr()[target];
- Index end = start + innerNonZeroPtr()[target];
- Index capacity = outerIndexPtr()[target + 1] - end;
- if (capacity > 0) {
- // `target` has room for interior insertion
- Index chunkSize = end - dst;
- // shift the existing data to the right if necessary
- if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
- break;
+ // find a vector with capacity, starting at `outer` and searching to the left and right
+ for (Index leftTarget = outer - 1, rightTarget = outer; (leftTarget >= 0) || (rightTarget < m_outerSize);) {
+ if (rightTarget < m_outerSize) {
+ Index start = m_outerIndex[rightTarget];
+ Index end = start + m_innerNonZeros[rightTarget];
+ Index nextStart = m_outerIndex[rightTarget + 1];
+ Index capacity = nextStart - end;
+ if (capacity > 0) {
+ // move [dst, end) to dst+1 and insert at dst
+ Index chunkSize = end - dst;
+ if (chunkSize > 0) m_data.moveChunk(dst, dst + 1, chunkSize);
+ m_innerNonZeros[outer]++;
+ for (Index j = outer; j < rightTarget; j++) m_outerIndex[j + 1]++;
+ m_data.index(dst) = StorageIndex(inner);
+ m_data.value(dst) = Scalar(0);
+ return m_data.value(dst);
+ }
+ rightTarget++;
+ }
+ if (leftTarget >= 0) {
+ Index start = m_outerIndex[leftTarget];
+ Index end = start + m_innerNonZeros[leftTarget];
+ Index nextStart = m_outerIndex[leftTarget + 1];
+ Index capacity = nextStart - end;
+ if (capacity > 0) {
+ // tricky: dst is a lower bound, so we must insert at dst-1 when shifting left
+ // move [nextStart, dst) to nextStart-1 and insert at dst-1
+ Index chunkSize = dst - nextStart;
+ if (chunkSize > 0) m_data.moveChunk(nextStart, nextStart - 1, chunkSize);
+ m_innerNonZeros[outer]++;
+ for (Index j = leftTarget; j < outer; j++) m_outerIndex[j + 1]--;
+ m_data.index(dst - 1) = StorageIndex(inner);
+ m_data.value(dst - 1) = Scalar(0);
+ return m_data.value(dst - 1);
+ }
+ leftTarget--;
}
}
- if (target == outerSize()) {
- // no room for interior insertion (to the right of `outer`)
- target = outer;
- Index dst_offset = dst - outerIndexPtr()[target];
- Index totalCapacity = data().allocatedSize() - data().size();
- eigen_assert(totalCapacity >= 0);
- if (totalCapacity == 0) {
- // there is no room left. we must reallocate. reserve space in each vector
- constexpr StorageIndex kReserveSizePerVector(2);
- reserveInnerVectors(IndexVector::Constant(outerSize(), kReserveSizePerVector));
+
+ // no room for interior insertion
+ // nonZeros() == m_data.size()
+  // record offset as outerIndexPtr will change
+ Index dst_offset = dst - m_outerIndex[outer];
+ // allocate space for random insertion
+ if (m_data.allocatedSize() == 0) {
+ // fast method to allocate space for one element per vector in empty matrix
+ m_data.resize(m_outerSize);
+ std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
+ } else {
+ // check for integer overflow: if maxReserveSize == 0, insertion is not possible
+ Index maxReserveSize = static_cast<Index>(NumTraits<StorageIndex>::highest()) - m_data.allocatedSize();
+ eigen_assert(maxReserveSize > 0);
+ if (m_outerSize <= maxReserveSize) {
+ // allocate space for one additional element per vector
+ reserveInnerVectors(IndexVector::Constant(m_outerSize, 1));
} else {
- // dont reallocate, but re-distribute the remaining capacity to the right of `outer`
- // each vector in the range [outer,outerSize) will receive totalCapacity / (outerSize - outer) nonzero
- // reservations each vector in the range [outer,remainder) will receive an additional nonzero reservation where
- // remainder = totalCapacity % (outerSize - outer)
+ // handle the edge case where StorageIndex is insufficient to reserve outerSize additional elements
+ // allocate space for one additional element in the interval [outer,maxReserveSize)
typedef internal::sparse_reserve_op<StorageIndex> ReserveSizesOp;
typedef CwiseNullaryOp<ReserveSizesOp, IndexVector> ReserveSizesXpr;
- ReserveSizesXpr reserveSizesXpr(outerSize(), 1, ReserveSizesOp(target, outerSize(), totalCapacity));
- eigen_assert(reserveSizesXpr.sum() == totalCapacity);
+ ReserveSizesXpr reserveSizesXpr(m_outerSize, 1, ReserveSizesOp(outer, m_outerSize, maxReserveSize));
reserveInnerVectors(reserveSizesXpr);
}
- Index start = outerIndexPtr()[target];
- Index end = start + innerNonZeroPtr()[target];
- dst = start + dst_offset;
- Index chunkSize = end - dst;
- if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
}
- // update nonzero counts
- innerNonZeroPtr()[outer]++;
- typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
- outerIndexMap.segment(outer + 1, target - outer).array() += 1;
- // initialize the coefficient
- data().index(dst) = StorageIndex(inner);
- // return a reference to the coefficient
- return data().value(dst) = Scalar(0);
+ // insert element at `dst` with new outer indices
+ Index start = m_outerIndex[outer];
+ Index end = start + m_innerNonZeros[outer];
+ Index new_dst = start + dst_offset;
+ Index chunkSize = end - new_dst;
+ if (chunkSize > 0) m_data.moveChunk(new_dst, new_dst + 1, chunkSize);
+ m_innerNonZeros[outer]++;
+ m_data.index(new_dst) = StorageIndex(inner);
+ m_data.value(new_dst) = Scalar(0);
+ return m_data.value(new_dst);
}
namespace internal {
diff --git a/Eigen/src/SparseCore/SparsePermutation.h b/Eigen/src/SparseCore/SparsePermutation.h
index af9a1fe..7e402cc 100644
--- a/Eigen/src/SparseCore/SparsePermutation.h
+++ b/Eigen/src/SparseCore/SparsePermutation.h
@@ -18,69 +18,142 @@
namespace internal {
+template<typename ExpressionType, typename PlainObjectType, bool NeedEval = !is_same<ExpressionType, PlainObjectType>::value>
+struct XprHelper
+{
+ XprHelper(const ExpressionType& xpr) : m_xpr(xpr) {}
+ inline const PlainObjectType& xpr() const { return m_xpr; }
+ // this is a new PlainObjectType initialized by xpr
+ const PlainObjectType m_xpr;
+};
+template<typename ExpressionType, typename PlainObjectType>
+struct XprHelper<ExpressionType, PlainObjectType, false>
+{
+ XprHelper(const ExpressionType& xpr) : m_xpr(xpr) {}
+ inline const PlainObjectType& xpr() const { return m_xpr; }
+ // this is a reference to xpr
+ const PlainObjectType& m_xpr;
+};
+
+template<typename PermDerived, bool NeedInverseEval>
+struct PermHelper
+{
+ using IndicesType = typename PermDerived::IndicesType;
+ using PermutationIndex = typename IndicesType::Scalar;
+ using type = PermutationMatrix<IndicesType::SizeAtCompileTime, IndicesType::MaxSizeAtCompileTime, PermutationIndex>;
+ PermHelper(const PermDerived& perm) : m_perm(perm.inverse()) {}
+ inline const type& perm() const { return m_perm; }
+ // this is a new PermutationMatrix initialized by perm.inverse()
+ const type m_perm;
+};
+template<typename PermDerived>
+struct PermHelper<PermDerived, false>
+{
+ using type = PermDerived;
+ PermHelper(const PermDerived& perm) : m_perm(perm) {}
+ inline const type& perm() const { return m_perm; }
+ // this is a reference to perm
+ const type& m_perm;
+};
+
template<typename ExpressionType, int Side, bool Transposed>
struct permutation_matrix_product<ExpressionType, Side, Transposed, SparseShape>
{
- typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
- typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+ using MatrixType = typename nested_eval<ExpressionType, 1>::type;
+ using MatrixTypeCleaned = remove_all_t<MatrixType>;
- typedef typename MatrixTypeCleaned::Scalar Scalar;
- typedef typename MatrixTypeCleaned::StorageIndex StorageIndex;
+ using Scalar = typename MatrixTypeCleaned::Scalar;
+ using StorageIndex = typename MatrixTypeCleaned::StorageIndex;
- enum {
- SrcStorageOrder = MatrixTypeCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
- MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
- };
-
- typedef std::conditional_t<MoveOuter,
- SparseMatrix<Scalar,SrcStorageOrder,StorageIndex>,
- SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> > ReturnType;
+ // the actual "return type" is `Dest`. this is a temporary type
+ using ReturnType = SparseMatrix<Scalar, MatrixTypeCleaned::IsRowMajor ? RowMajor : ColMajor, StorageIndex>;
+ using TmpHelper = XprHelper<ExpressionType, ReturnType>;
- template<typename Dest,typename PermutationType>
- static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
- {
- MatrixType mat(xpr);
- if(MoveOuter)
- {
- SparseMatrix<Scalar,SrcStorageOrder,StorageIndex> tmp(mat.rows(), mat.cols());
- Matrix<StorageIndex,Dynamic,1> sizes(mat.outerSize());
- for(Index j=0; j<mat.outerSize(); ++j)
- {
+ static constexpr bool NeedOuterPermutation = ExpressionType::IsRowMajor ? Side == OnTheLeft : Side == OnTheRight;
+ static constexpr bool NeedInversePermutation = Transposed ? Side == OnTheLeft : Side == OnTheRight;
+
+ template <typename Dest, typename PermutationType>
+ static inline void permute_outer(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+
+ // if ExpressionType is not ReturnType, evaluate `xpr` (allocation)
+ // otherwise, just reference `xpr`
+ // TODO: handle trivial expressions such as CwiseBinaryOp without temporary
+ const TmpHelper tmpHelper(xpr);
+ const ReturnType& tmp = tmpHelper.xpr();
+
+ ReturnType result(tmp.rows(), tmp.cols());
+
+ for (Index j = 0; j < tmp.outerSize(); j++) {
Index jp = perm.indices().coeff(j);
- sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = StorageIndex(mat.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros());
+ Index jsrc = NeedInversePermutation ? jp : j;
+ Index jdst = NeedInversePermutation ? j : jp;
+ Index begin = tmp.outerIndexPtr()[jsrc];
+ Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[jsrc + 1] : begin + tmp.innerNonZeroPtr()[jsrc];
+ result.outerIndexPtr()[jdst + 1] += end - begin;
}
- tmp.reserve(sizes);
- for(Index j=0; j<mat.outerSize(); ++j)
- {
- Index jp = perm.indices().coeff(j);
- Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
- Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
- for(typename MatrixTypeCleaned::InnerIterator it(mat,jsrc); it; ++it)
- tmp.insertByOuterInner(jdst,it.index()) = it.value();
- }
- dst = tmp;
- }
- else
- {
- SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> tmp(mat.rows(), mat.cols());
- Matrix<StorageIndex,Dynamic,1> sizes(tmp.outerSize());
- sizes.setZero();
- PermutationMatrix<Dynamic,Dynamic,StorageIndex> perm_cpy;
- if((Side==OnTheLeft) ^ Transposed)
- perm_cpy = perm;
- else
- perm_cpy = perm.transpose();
- for(Index j=0; j<mat.outerSize(); ++j)
- for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
- sizes[perm_cpy.indices().coeff(it.index())]++;
- tmp.reserve(sizes);
- for(Index j=0; j<mat.outerSize(); ++j)
- for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
- tmp.insertByOuterInner(perm_cpy.indices().coeff(it.index()),j) = it.value();
- dst = tmp;
- }
+ std::partial_sum(result.outerIndexPtr(), result.outerIndexPtr() + result.outerSize() + 1,
+ result.outerIndexPtr());
+ result.resizeNonZeros(result.nonZeros());
+
+ for (Index j = 0; j < tmp.outerSize(); j++) {
+ Index jp = perm.indices().coeff(j);
+ Index jsrc = NeedInversePermutation ? jp : j;
+ Index jdst = NeedInversePermutation ? j : jp;
+ Index begin = tmp.outerIndexPtr()[jsrc];
+ Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[jsrc + 1] : begin + tmp.innerNonZeroPtr()[jsrc];
+ Index target = result.outerIndexPtr()[jdst];
+ smart_copy(tmp.innerIndexPtr() + begin, tmp.innerIndexPtr() + end, result.innerIndexPtr() + target);
+ smart_copy(tmp.valuePtr() + begin, tmp.valuePtr() + end, result.valuePtr() + target);
+ }
+ dst = std::move(result);
}
+
+ template <typename Dest, typename PermutationType>
+ static inline void permute_inner(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+ using InnerPermHelper = PermHelper<PermutationType, NeedInversePermutation>;
+ using InnerPermType = typename InnerPermHelper::type;
+
+ // if ExpressionType is not ReturnType, evaluate `xpr` (allocation)
+ // otherwise, just reference `xpr`
+ // TODO: handle trivial expressions such as CwiseBinaryOp without temporary
+ const TmpHelper tmpHelper(xpr);
+ const ReturnType& tmp = tmpHelper.xpr();
+
+ // if inverse permutation of inner indices is requested, calculate perm.inverse() (allocation)
+ // otherwise, just reference `perm`
+ const InnerPermHelper permHelper(perm);
+ const InnerPermType& innerPerm = permHelper.perm();
+
+ ReturnType result(tmp.rows(), tmp.cols());
+
+ for (Index j = 0; j < tmp.outerSize(); j++) {
+ Index begin = tmp.outerIndexPtr()[j];
+ Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[j + 1] : begin + tmp.innerNonZeroPtr()[j];
+ result.outerIndexPtr()[j + 1] += end - begin;
+ }
+
+ std::partial_sum(result.outerIndexPtr(), result.outerIndexPtr() + result.outerSize() + 1, result.outerIndexPtr());
+ result.resizeNonZeros(result.nonZeros());
+
+ for (Index j = 0; j < tmp.outerSize(); j++) {
+ Index begin = tmp.outerIndexPtr()[j];
+ Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[j + 1] : begin + tmp.innerNonZeroPtr()[j];
+ Index target = result.outerIndexPtr()[j];
+ std::transform(tmp.innerIndexPtr() + begin, tmp.innerIndexPtr() + end, result.innerIndexPtr() + target,
+ [&innerPerm](StorageIndex i) { return innerPerm.indices().coeff(i); });
+ smart_copy(tmp.valuePtr() + begin, tmp.valuePtr() + end, result.valuePtr() + target);
+ }
+ // the inner indices were permuted, and must be sorted
+ result.sortInnerIndices();
+ dst = std::move(result);
+ }
+
+ template <typename Dest, typename PermutationType, bool DoOuter = NeedOuterPermutation, std::enable_if_t<DoOuter, int> = 0>
+ static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { permute_outer(dst, perm, xpr); }
+
+ template <typename Dest, typename PermutationType, bool DoOuter = NeedOuterPermutation, std::enable_if_t<!DoOuter, int> = 0>
+ static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { permute_inner(dst, perm, xpr); }
};
}
@@ -143,35 +216,34 @@
} // end namespace internal
/** \returns the matrix with the permutation applied to the columns
- */
-template<typename SparseDerived, typename PermDerived>
-inline const Product<SparseDerived, PermDerived, AliasFreeProduct>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
-{ return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived()); }
+ */
+template <typename SparseDerived, typename PermDerived>
+inline const Product<SparseDerived, PermDerived, AliasFreeProduct> operator*(
+ const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm) {
+ return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived());
+}
/** \returns the matrix with the permutation applied to the rows
- */
-template<typename SparseDerived, typename PermDerived>
-inline const Product<PermDerived, SparseDerived, AliasFreeProduct>
-operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
-{ return Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived()); }
-
+ */
+template <typename SparseDerived, typename PermDerived>
+inline const Product<PermDerived, SparseDerived, AliasFreeProduct> operator*(
+ const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix) {
+ return Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived());
+}
/** \returns the matrix with the inverse permutation applied to the columns.
- */
-template<typename SparseDerived, typename PermutationType>
-inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm)
-{
+ */
+template <typename SparseDerived, typename PermutationType>
+inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct> operator*(
+ const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm) {
return Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>(matrix.derived(), tperm.derived());
}
/** \returns the matrix with the inverse permutation applied to the rows.
- */
-template<typename SparseDerived, typename PermutationType>
-inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>
-operator*(const InverseImpl<PermutationType,PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix)
-{
+ */
+template <typename SparseDerived, typename PermutationType>
+inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct> operator*(
+ const InverseImpl<PermutationType, PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix) {
return Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived());
}
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h
index b6ad6e8..0703c2b 100644
--- a/Eigen/src/misc/lapacke_helpers.h
+++ b/Eigen/src/misc/lapacke_helpers.h
@@ -76,12 +76,6 @@
return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
}
-/// translate UpLo type to the corresponding letter code
-template<UpLoType mode> char translate_mode;
-template<> constexpr char translate_mode<Lower> = 'L';
-template<> constexpr char translate_mode<Upper> = 'U';
-
-
// ---------------------------------------------------------------------------------------------------------------------
// Automatic generation of low-level wrappers
// ---------------------------------------------------------------------------------------------------------------------
diff --git a/bench/tensors/README b/bench/tensors/README
index dcbf021..acce5a1 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -16,5 +16,11 @@
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
2. bash eigen_sycl_bench.sh
+To compile the floating point GPU benchmarks using Intel DPCPP compiler
+/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl
+
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the contraction with DPCPP:
+/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 1ddaa12..639790c 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -230,10 +230,10 @@
# Add the tests to ctest.
add_test(NAME ${test_target_ok}
- COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $<CONFIGURATION>
+ COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $<CONFIG>
WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
add_test(NAME ${test_target_ko}
- COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ko} --config $<CONFIGURATION>
+ COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ko} --config $<CONFIG>
WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Expect the second test to fail
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
new file mode 100644
index 0000000..73aa30f
--- /dev/null
+++ b/cmake/FindDPCPP.cmake
@@ -0,0 +1,62 @@
+include_guard()
+
+include(CheckCXXCompilerFlag)
+include(FindPackageHandleStandardArgs)
+
+if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND
+ "${DPCPP_SYCL_ARCH}" STREQUAL "")
+ message(FATAL_ERROR "Architecture required for AMD DPCPP builds,"
+ " please specify in DPCPP_SYCL_ARCH")
+endif()
+
+set(DPCPP_USER_FLAGS "" CACHE STRING
+ "Additional user-specified compiler flags for DPC++")
+
+get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
+find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
+
+add_library(DPCPP::DPCPP INTERFACE IMPORTED)
+
+set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0")
+if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
+ if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa")
+ list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
+ list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}")
+ elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
+ list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
+ list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
+ endif()
+endif()
+
+if(UNIX)
+ set_target_properties(DPCPP::DPCPP PROPERTIES
+ INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
+ INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
+ INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
+ INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
+ message(STATUS ">>>>>>>>> DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl")
+else()
+ set_target_properties(DPCPP::DPCPP PROPERTIES
+ INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
+ INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
+ INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
+endif()
+
+function(add_sycl_to_target)
+ set(options)
+ set(one_value_args TARGET)
+ set(multi_value_args SOURCES)
+ cmake_parse_arguments(SB_ADD_SYCL
+ "${options}"
+ "${one_value_args}"
+ "${multi_value_args}"
+ ${ARGN}
+ )
+ target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
+ target_link_libraries(${SB_ADD_SYCL_TARGET} DPCPP::DPCPP)
+ target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17)
+ get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
+ if (NOT target_type STREQUAL "OBJECT_LIBRARY")
+ target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
+ endif()
+endfunction()
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index af8a1ef..f0a80df 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -83,6 +83,7 @@
Scalar e = static_cast<Scalar>(ref(x(i,j), y(i,j)));
Scalar a = actual(i, j);
bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
+ if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
all_pass &= success;
if (!success) {
std::cout << name << "(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl;
@@ -95,11 +96,11 @@
template <typename Scalar>
void binary_ops_test() {
binary_op_test<Scalar>("pow",
- [](auto x, auto y) { return Eigen::pow(x, y); },
- [](auto x, auto y) { return std::pow(x, y); });
+ [](const auto& x, const auto& y) { return Eigen::pow(x, y); },
+ [](const auto& x, const auto& y) { return std::pow(x, y); });
binary_op_test<Scalar>("atan2",
- [](auto x, auto y) { return Eigen::atan2(x, y); },
- [](auto x, auto y) { return std::atan2(x, y); });
+ [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
+ [](const auto& x, const auto& y) { return std::atan2(x, y); });
}
template <typename Scalar>
@@ -125,6 +126,7 @@
Scalar a = eigenPow(j);
bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
((numext::isnan)(a) && (numext::isnan)(e));
+ if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
all_pass &= success;
if (!success) {
std::cout << "pow(" << bases(j) << "," << exponent << ") = " << a << " != " << e << std::endl;
@@ -138,6 +140,7 @@
Scalar a = eigenPow(j);
bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
((numext::isnan)(a) && (numext::isnan)(e));
+ if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
all_pass &= success;
if (!success) {
std::cout << "pow(" << bases(j) << "," << exponent << ") = " << a << " != " << e << std::endl;
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index 06e04a2..34d7215 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -24,10 +24,10 @@
ColVectorType cv1 = ColVectorType::Random(rows);
RowVectorType rv1 = RowVectorType::Random(cols);
-
+
Scalar s1 = internal::random<Scalar>(),
s2 = internal::random<Scalar>();
-
+
// scalar addition
VERIFY_IS_APPROX(m1.array() + s1, s1 + m1.array());
VERIFY_IS_APPROX((m1.array() + s1).matrix(), MatrixType::Constant(rows,cols,s1) + m1);
@@ -55,18 +55,18 @@
VERIFY_IS_APPROX(m3.rowwise() += rv1, m1.rowwise() + rv1);
m3 = m1;
VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
-
- // empty objects
- VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().sum()), RowVectorType::Zero(cols));
- VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().sum()), ColVectorType::Zero(rows));
- VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().prod()), RowVectorType::Ones(cols));
- VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().prod()), ColVectorType::Ones(rows));
- VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().sum(), RowVectorType::Zero(cols));
- VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().sum(), ColVectorType::Zero(rows));
- VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().prod(), RowVectorType::Ones(cols));
- VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().prod(), ColVectorType::Ones(rows));
-
+ // empty objects
+ VERIFY_IS_EQUAL((m1.template block<0,Dynamic>(0,0,0,cols).colwise().sum()), RowVectorType::Zero(cols));
+ VERIFY_IS_EQUAL((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().sum()), ColVectorType::Zero(rows));
+ VERIFY_IS_EQUAL((m1.template block<0,Dynamic>(0,0,0,cols).colwise().prod()), RowVectorType::Ones(cols));
+ VERIFY_IS_EQUAL((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().prod()), ColVectorType::Ones(rows));
+
+ VERIFY_IS_EQUAL(m1.block(0,0,0,cols).colwise().sum(), RowVectorType::Zero(cols));
+ VERIFY_IS_EQUAL(m1.block(0,0,rows,0).rowwise().sum(), ColVectorType::Zero(rows));
+ VERIFY_IS_EQUAL(m1.block(0,0,0,cols).colwise().prod(), RowVectorType::Ones(cols));
+ VERIFY_IS_EQUAL(m1.block(0,0,rows,0).rowwise().prod(), ColVectorType::Ones(rows));
+
// verify the const accessors exist
const Scalar& ref_m1 = m.matrix().array().coeffRef(0);
const Scalar& ref_m2 = m.matrix().array().coeffRef(0,0);
diff --git a/test/blasutil.cpp b/test/blasutil.cpp
index ee98df4..2fc8b68 100644
--- a/test/blasutil.cpp
+++ b/test/blasutil.cpp
@@ -13,7 +13,7 @@
// for packet_traits<Packet*>
// => The only workaround would be to wrap _m128 and the likes
// within wrappers.
-#if EIGEN_GNUC_AT_LEAST(6,0)
+#if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0)
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index 82f9d61..00838ea 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -139,7 +139,7 @@
out[out_idx++] = a / numext::real(b);
out[out_idx++] = numext::real(a) / b;
-#if !defined(EIGEN_COMP_MSVC)
+#if !EIGEN_COMP_MSVC
out[out_idx] = a; out[out_idx++] += b;
out[out_idx] = a; out[out_idx++] -= b;
out[out_idx] = a; out[out_idx++] *= b;
@@ -191,7 +191,7 @@
res.segment(block_idx, size) = x1.real().array() / x2.array();
block_idx += size;
-#if !defined(EIGEN_COMP_MSVC)
+#if !EIGEN_COMP_MSVC
res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
block_idx += size;
res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 05080ad..659babe 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -146,7 +146,7 @@
std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << "\n";
#endif
- #ifdef EIGEN_COMP_NVCC
+ #if EIGEN_COMP_NVCC
std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << "\n";
#endif
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
index 008e95c..ca9378b 100644
--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -395,7 +395,7 @@
std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << std::endl;
#endif
- #ifdef EIGEN_COMP_NVCC
+ #if EIGEN_COMP_NVCC
std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << std::endl;
#endif
diff --git a/test/main.h b/test/main.h
index a52da9e..8538581 100644
--- a/test/main.h
+++ b/test/main.h
@@ -18,6 +18,9 @@
#include <vector>
#include <typeinfo>
#include <functional>
+#ifdef EIGEN_USE_SYCL
+#include <CL/sycl.hpp>
+#endif
// The following includes of STL headers have to be done _before_ the
// definition of macros min() and max(). The reason is that many STL
@@ -121,9 +124,7 @@
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
#define B0 FORBIDDEN_IDENTIFIER
-// `I` may be defined by complex.h:
#define I FORBIDDEN_IDENTIFIER
-
// Unit tests calling Eigen's blas library must preserve the default blocking size
// to avoid troubles.
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
@@ -301,15 +302,16 @@
}
#endif //EIGEN_EXCEPTIONS
- #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
+ #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS
#define eigen_assert(a) \
if( (!(a)) && (!no_more_assert) ) \
{ \
Eigen::no_more_assert = true; \
- if(report_on_cerr_on_assert_failure) \
+ if(report_on_cerr_on_assert_failure) { \
eigen_plain_assert(a); \
- else \
+ } else { \
EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
+ } \
}
#ifdef EIGEN_EXCEPTIONS
diff --git a/test/meta.cpp b/test/meta.cpp
index cac7af1..97ebdd7 100644
--- a/test/meta.cpp
+++ b/test/meta.cpp
@@ -86,9 +86,9 @@
VERIFY(( check_is_convertible(A*B, A) ));
}
- #if (EIGEN_COMP_GNUC && EIGEN_COMP_GNUC <= 99) \
- || (EIGEN_COMP_CLANG && EIGEN_COMP_CLANG <= 909) \
- || (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC <=1914)
+ #if (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC <= 990) \
+ || (EIGEN_COMP_CLANG_STRICT && EIGEN_COMP_CLANG <= 990) \
+ || (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC <= 1914)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1752,
// basically, a fix in the c++ standard breaks our c++98 implementation
// of is_convertible for abstract classes.
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index 4185f51..b0cc000 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -15,7 +15,6 @@
template <typename MatrixType>
void cod() {
- STATIC_CHECK(( internal::is_same<typename CompleteOrthogonalDecomposition<MatrixType>::StorageIndex,int>::value ));
Index rows = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
Index cols = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
@@ -100,8 +99,6 @@
{
using std::sqrt;
- STATIC_CHECK(( internal::is_same<typename ColPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
-
Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
index cca9a8c..bd40881 100644
--- a/test/qr_fullpivoting.cpp
+++ b/test/qr_fullpivoting.cpp
@@ -14,7 +14,6 @@
template<typename MatrixType> void qr()
{
- STATIC_CHECK(( internal::is_same<typename FullPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime;
Index max_size = EIGEN_TEST_MAX_SIZE;
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 6e2661e..a9fc9fd 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -105,7 +105,6 @@
VERIFY(g_realloc_count==0);
}
- m2.finalize();
VERIFY_IS_APPROX(m2,m1);
}
diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
index 5974c74..775c560 100644
--- a/test/sparse_permutations.cpp
+++ b/test/sparse_permutations.cpp
@@ -17,6 +17,15 @@
VERIFY( (#XPR) && nb_transposed_copies==N ); \
}
+static long int nb_temporaries;
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN {nb_temporaries++;}
+#define VERIFY_TEMPORARY_COUNT(XPR,N) {\
+ nb_temporaries = 0; \
+ XPR; \
+ if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+ VERIFY( (#XPR) && nb_temporaries==N ); \
+ }
+
#include "sparse.h"
template<typename T>
@@ -79,28 +88,52 @@
VERIFY( is_sorted( ::eval(mat*p) ));
VERIFY( is_sorted( res = mat*p ));
VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p), 0);
- //VERIFY_TRANSPOSITION_COUNT( res = mat*p, IsRowMajor ? 1 : 0 );
+ VERIFY_TEMPORARY_COUNT( ::eval(mat*p), 1)
res_d = mat_d*p;
VERIFY(res.isApprox(res_d) && "mat*p");
VERIFY( is_sorted( ::eval(p*mat) ));
VERIFY( is_sorted( res = p*mat ));
VERIFY_TRANSPOSITION_COUNT( ::eval(p*mat), 0);
+ VERIFY_TEMPORARY_COUNT( ::eval(p*mat), 1);
res_d = p*mat_d;
VERIFY(res.isApprox(res_d) && "p*mat");
VERIFY( is_sorted( (mat*p).eval() ));
VERIFY( is_sorted( res = mat*p.inverse() ));
VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p.inverse()), 0);
+ VERIFY_TEMPORARY_COUNT( ::eval(mat*p.inverse()), 1);
res_d = mat*p.inverse();
VERIFY(res.isApprox(res_d) && "mat*inv(p)");
VERIFY( is_sorted( (p*mat+p*mat).eval() ));
VERIFY( is_sorted( res = p.inverse()*mat ));
VERIFY_TRANSPOSITION_COUNT( ::eval(p.inverse()*mat), 0);
+ VERIFY_TEMPORARY_COUNT( ::eval(p.inverse()*mat), 1);
res_d = p.inverse()*mat_d;
VERIFY(res.isApprox(res_d) && "inv(p)*mat");
+ // test non-plaintype expressions that require additional temporary
+ const Scalar alpha(2.34);
+
+ res_d = p * (alpha * mat_d);
+ VERIFY_TEMPORARY_COUNT( res = p * (alpha * mat), 2);
+ VERIFY( res.isApprox(res_d) && "p*(alpha*mat)" );
+
+ res_d = (alpha * mat_d) * p;
+ VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p, 2);
+ VERIFY( res.isApprox(res_d) && "(alpha*mat)*p" );
+
+ res_d = p.inverse() * (alpha * mat_d);
+ VERIFY_TEMPORARY_COUNT( res = p.inverse() * (alpha * mat), 2);
+ VERIFY( res.isApprox(res_d) && "inv(p)*(alpha*mat)" );
+
+ res_d = (alpha * mat_d) * p.inverse();
+ VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p.inverse(), 2);
+ VERIFY( res.isApprox(res_d) && "(alpha*mat)*inv(p)" );
+
+ //
+
VERIFY( is_sorted( (p * mat * p.inverse()).eval() ));
VERIFY( is_sorted( res = mat.twistedBy(p) ));
VERIFY_TRANSPOSITION_COUNT( ::eval(p * mat * p.inverse()), 0);
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index 8b46fc8..dbf549a 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -477,7 +477,7 @@
initSparse<Real>(0.1, refA, A);
initSparse<Real>(0.1, refB, B);
- SparseMatrix<Real, OrderC, std::int16_t> result;
+ SparseMatrix<Real, OrderC, std::int8_t> result;
SparseMatrix<Real, OrderC> result_large;
DenseMat refResult;
@@ -486,8 +486,8 @@
// Case: Small input but large result
{
- SparseMatrix<Real, OrderA, std::int16_t> A(127, 8);
- SparseMatrix<Real, OrderB, std::int16_t> B(8, 127);
+ SparseMatrix<Real, OrderA, std::int8_t> A(127, 8);
+ SparseMatrix<Real, OrderB, std::int8_t> B(8, 127);
DenseMat refA(127, 8);
DenseMat refB(8, 127);
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index a6441ab..59f88cf 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -26,7 +26,7 @@
// for packet_traits<Packet*>
// => The only workaround would be to wrap _m128 and the likes
// within wrappers.
-#if EIGEN_GNUC_AT_LEAST(6,0)
+#if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0)
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 0bd3a00..3365b72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -170,10 +170,10 @@
m_rightImpl.cleanup();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index a4ac2ad..e3fc8f4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -1185,7 +1185,6 @@
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return derived();
}
-
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
EIGEN_DEVICE_FUNC
@@ -1195,3 +1194,4 @@
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index ef01e30..e5f3d5e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -438,13 +438,13 @@
: Base(op, device)
{ }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
if (this->isInnerChipping()) {
// m_stride is equal to 1, so let's avoid the integer division.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 073be81..dea41df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -331,7 +331,7 @@
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
// Collect dimension-wise indices (subs).
array<Index, Base::NumDims> subs;
@@ -360,7 +360,7 @@
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index c629e44..3a917a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -362,10 +362,8 @@
const OutputKernelType m_output_kernel;
};
-
template<typename Derived>
-struct TensorContractionEvaluatorBase : internal::no_assignment_operator
-{
+struct TensorContractionEvaluatorBase {
typedef typename internal::traits<Derived>::Indices Indices;
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
typedef typename internal::traits<Derived>::RightArgType RightArgType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index c6c4077..526fc81 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -597,7 +597,7 @@
const TripleDim triple_dim_)
: TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
@@ -636,7 +636,7 @@
// privateRes memory of Each computation the compute block function is independent of local and no local concepts as
// it only compute the block on each thread's private memory space
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
- PacketReturnType *privateRes) {
+ PacketReturnType *privateRes) const {
StorageIndex idx = 0;
EIGEN_CONSTEXPR StorageIndex lhs_stride =
contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
@@ -661,7 +661,7 @@
// class.
template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
- StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+ StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
};
@@ -713,7 +713,7 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::enable_if_t<contract_tp == contraction_type::no_local>
extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
- const StorageIndex &ncOffset, const StorageIndex cOffset) {
+ const StorageIndex &ncOffset, const StorageIndex cOffset) const {
EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
@@ -833,7 +833,8 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
ThreadProperties<StorageIndex> &thread_properties,
TiledMemory &tiled_input_block,
- PacketReturnType *privateRes, bool &db_offset) {
+ PacketReturnType *privateRes, bool &db_offset) const {
+
// Tiling the Rhs block from global to local memory
extract_block<RHSBlockProperties, is_internal_block>(
rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
@@ -871,7 +872,7 @@
template <bool is_internal_block, typename OutPtr>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
ThreadProperties<StorageIndex> &thread_properties,
- OutPtr out_ptr) {
+ OutPtr out_ptr) const {
auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
// Allocate register space
PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
@@ -897,7 +898,7 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::enable_if_t<contract_tp == contraction_type::local>
extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
- const StorageIndex &ncOffset, const StorageIndex cOffset) {
+ const StorageIndex &ncOffset, const StorageIndex cOffset) const {
EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
EIGEN_CONSTEXPR StorageIndex LoadPerThread =
@@ -1035,7 +1036,7 @@
nonContractDim(nonContractDim_),
contractDim(contractDim_) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto scratch_ptr = scratch.get_pointer();
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
@@ -1252,7 +1253,8 @@
const StorageIndex rng_)
: scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
- EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
+
auto out_ptr = out_res.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 70d2129..e6e586b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -123,7 +123,9 @@
inputIndex += idx * m_inputStrides[d];
p -= idx * m_gpuInputStrides[d];
}
- inputIndex += p * m_inputStrides[NumKernelDims];
+ if (NumKernelDims < NumDims) {
+ inputIndex += p * m_inputStrides[NumKernelDims];
+ }
} else {
std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
@@ -147,7 +149,9 @@
outputIndex += idx * m_outputStrides[d];
p -= idx * m_gpuOutputStrides[d];
}
- outputIndex += p * m_outputStrides[NumKernelDims];
+ if (NumKernelDims < NumDims) {
+ outputIndex += p * m_outputStrides[NumKernelDims];
+ }
} else {
std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
@@ -386,7 +390,7 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_inputImpl.evalSubExprsIfNeeded(NULL);
preloadKernel();
return true;
@@ -824,7 +828,7 @@
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
preloadKernel();
m_inputImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@@ -1112,9 +1116,6 @@
}
private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
-
TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
KernelArgType m_kernelArg;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 3cbc1ab..7ccb174 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -57,10 +57,10 @@
input_range(input_range_) {}
template <typename BooleanDim2>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
return (boolean_check[0] && boolean_check[1]);
}
- void operator()(cl::sycl::nd_item<2> itemID) {
+ void operator()(cl::sycl::nd_item<2> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the required row to be calculated for the for each plane in shered memory
@@ -123,11 +123,11 @@
kernel_size(kernel_size_),
input_range(input_range_) {}
template <typename BooleanDim3>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
- void operator()(cl::sycl::nd_item<3> itemID) {
+ void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the required row to be calculated for the for each plane in shered memory
@@ -212,10 +212,10 @@
input_range(input_range_),
numP(numP_) {}
template <typename BooleanDim3>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
- void operator()(cl::sycl::nd_item<3> itemID) {
+ void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 84ebe38..8fdc8ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -31,8 +31,7 @@
.template get_info<cl::sycl::info::device::local_mem_type>()),
max_work_item_sizes(
queue.get_device()
- .template get_info<
- cl::sycl::info::device::max_work_item_sizes>()),
+ .template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
max_mem_alloc_size(
queue.get_device()
.template get_info<
@@ -356,7 +355,7 @@
return;
}
const ptrdiff_t count = end - begin;
- auto f = [&](cl::sycl::handler &cgh) {
+ auto f = [&](cl::sycl::handler &cgh) {
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
cgh.fill(dst_acc, value);
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 0e9cdfe..7e13c69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -159,10 +159,10 @@
}
#endif
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
m_buffer[i] = m_impl.coeff(i);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 92ad0f5..2bd94c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -98,7 +98,7 @@
return m_data[index];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const{
eigen_assert(m_data != NULL);
return m_data[index];
}
@@ -122,7 +122,7 @@
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
}
@@ -137,7 +137,7 @@
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
- coeffRef(const array<DenseIndex, NumCoords>& coords) {
+ coeffRef(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
@@ -978,7 +978,14 @@
TensorEvaluator<ElseArgType, Device> m_elseImpl;
};
-
} // end namespace Eigen
+#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
+template <typename Derived, typename Device>
+struct cl::sycl::is_device_copyable<
+ Eigen::TensorEvaluator<Derived, Device>,
+ std::enable_if_t<!std::is_trivially_copyable<
+ Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
+#endif
+
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index de9bed4..f961b40 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -688,12 +688,12 @@
: evaluator(evaluator_), range(range_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
- cl::sycl::nd_item<1> itemID) {
+ cl::sycl::nd_item<1> itemID) const {
compute(itemID);
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
- compute(const cl::sycl::nd_item<1>& itemID) {
+ compute(const cl::sycl::nd_item<1>& itemID) const {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
@@ -703,7 +703,7 @@
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
- compute(const cl::sycl::nd_item<1>& itemID) {
+ compute(const cl::sycl::nd_item<1>& itemID) const {
const Index vectorizedRange =
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index d12bd6d..8b14085 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -202,12 +202,12 @@
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 0205710..b696190b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -267,13 +267,13 @@
TensorBlockDesc;
//===--------------------------------------------------------------------===//
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
@@ -733,7 +733,7 @@
: Base(op, device)
{ }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
if (this->m_is_identity) {
return this->m_impl.coeffRef(index);
@@ -743,7 +743,7 @@
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
if (this->m_is_identity) {
this->m_impl.template writePacket<StoreMode>(index, x);
@@ -1085,7 +1085,7 @@
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
if (this->m_is_identity) {
return this->m_impl.coeffRef(index);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index e62397a..ae03ba5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -895,7 +895,7 @@
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
- m_result.bind(cgh);
+ if(m_result) m_result.bind(cgh);
}
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 397870f..715797d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -87,7 +87,7 @@
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
+ void operator()(cl::sycl::nd_item<1> itemID) const {
// Our empirical research shows that the best performance will be achieved
// when there is only one element per thread to reduce in the second step.
// in this step the second step reduction time is almost negligible.
@@ -141,11 +141,11 @@
Index rng_, OpType op_)
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
- void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+ void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
- const cl::sycl::nd_item<1> &itemID) {
+ const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
@@ -184,7 +184,7 @@
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
- const cl::sycl::nd_item<1> &itemID) {
+ const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
@@ -228,14 +228,16 @@
range(range_),
num_values_to_reduce(num_values_to_reduce_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
+ void operator()(cl::sycl::nd_item<1> itemID) const {
+ //This is to bypass the stateful condition in Eigen meanReducer
+ Op non_const_functor;
+ std::memcpy(&non_const_functor, &functor, sizeof (Op));
auto output_accessor_ptr = output_accessor.get_pointer();
- /// const cast added as a naive solution to solve the qualifier drop error
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
- evaluator, evaluator.firstInput(globalid), functor, &accum);
+ evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
}
}
@@ -281,7 +283,7 @@
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
- CoeffReturnType &accumulator) {
+ CoeffReturnType &accumulator) const {
if (globalPId >= num_coeffs_to_preserve) {
return;
}
@@ -298,7 +300,7 @@
global_offset += per_thread_global_stride;
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const Index linearLocalThreadId = itemID.get_local_id(0);
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
@@ -380,7 +382,7 @@
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const Index globalId = itemID.get_global_id(0);
if (globalId >= num_coeffs_to_preserve) return;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index b5e66b3..342d9e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -441,12 +441,12 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
return this->m_impl.coeffRef(this->reverseIndex(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x) {
+ void writePacket(Index index, const PacketReturnType& x) const {
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
index 636fb7d..ecc872b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -109,28 +109,28 @@
template <scan_step sst = stp, typename Input>
std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
- read(const Input &inpt, Index global_id) {
+ read(const Input &inpt, Index global_id) const {
return inpt.coeff(global_id);
}
template <scan_step sst = stp, typename Input>
std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
- read(const Input &inpt, Index global_id) {
+ read(const Input &inpt, Index global_id) const {
return inpt[global_id];
}
template <scan_step sst = stp, typename InclusiveOp>
std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+ first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
inclusive_op();
}
template <scan_step sst = stp, typename InclusiveOp>
std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- first_step_inclusive_Operation(InclusiveOp) {}
+ first_step_inclusive_Operation(InclusiveOp) const {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
@@ -307,7 +307,7 @@
scanParameters(scanParameters_),
accumulator(accumulator_) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
@@ -473,7 +473,7 @@
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::Storage Storage;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
- void operator()(Self &self, EvaluatorPointerType data) {
+ void operator()(Self &self, EvaluatorPointerType data) const {
const Index total_size = internal::array_prod(self.dimensions());
const Index scan_size = self.size();
const Index scan_stride = self.stride();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 977263a..3d97e66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -390,13 +390,13 @@
: Base(op, device)
{ }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 609afe3..681c4e5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -288,13 +288,13 @@
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
+ void writePacket(Index index, const PacketReturnType& x) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
index 4545119..ff3f834 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -31,8 +31,7 @@
#endif
// Checks whether C++11's `thread_local` storage duration specifier is
// supported.
-#if defined(__apple_build_version__) && \
- ((__apple_build_version__ < 8000042) || \
+#if EIGEN_COMP_CLANGAPPLE && ((EIGEN_COMP_CLANGAPPLE < 8000042) || \
(TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
// Notes: Xcode's clang did not support `thread_local` until version
// 8, and even then not for all iOS < 9.0.
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index cf5828e..5c26133 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -189,6 +189,7 @@
readsizes = true;
mat.resize(M,N);
mat.reserve(NNZ);
+ elements.reserve(NNZ);
}
}
else
diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt
index 8ee96f5..4fe94c6 100644
--- a/unsupported/doc/examples/SYCL/CMakeLists.txt
+++ b/unsupported/doc/examples/SYCL/CMakeLists.txt
@@ -2,9 +2,8 @@
set(EIGEN_SYCL ON)
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
-if(EIGEN_SYCL_TRISYCL)
- set(CMAKE_CXX_STANDARD 17)
-else(EIGEN_SYCL_TRISYCL)
+set(CMAKE_CXX_STANDARD 17)
+if(EIGEN_SYCL_ComputeCpp)
if(MSVC)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
@@ -22,7 +21,7 @@
-no-serial-memop
-Xclang
-cl-mad-enable)
-endif(EIGEN_SYCL_TRISYCL)
+endif(EIGEN_SYCL_ComputeCpp)
FOREACH(example_src ${examples_SRCS})
GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 21d8c5e..45cc4a3 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -122,6 +122,7 @@
if(EIGEN_TEST_SYCL)
set(EIGEN_SYCL ON)
+ set(CMAKE_CXX_STANDARD 17)
# Forward CMake options as preprocessor definitions
if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
@@ -172,10 +173,7 @@
add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
endif()
- if(EIGEN_SYCL_TRISYCL)
- # triSYCL now requires c++17.
- set(CMAKE_CXX_STANDARD 17)
- else()
+ if(EIGEN_SYCL_ComputeCpp)
if(MSVC)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
@@ -193,7 +191,7 @@
-no-serial-memop
-Xclang
-cl-mad-enable)
- endif()
+ endif(EIGEN_SYCL_ComputeCpp)
ei_add_test(cxx11_tensor_sycl)
ei_add_test(cxx11_tensor_image_op_sycl)
@@ -409,4 +407,3 @@
endif()
endif()
-
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 7bf4e96..714540e 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -15,7 +15,7 @@
// Visual studio doesn't implement a rand_r() function since its
// implementation of rand() is already thread safe
int rand_reentrant(unsigned int* s) {
-#ifdef EIGEN_COMP_MSVC_STRICT
+#if EIGEN_COMP_MSVC_STRICT
EIGEN_UNUSED_VARIABLE(s);
return rand();
#else
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 8fc5a30..0f7f13b 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -17,7 +17,7 @@
// Visual studio doesn't implement a rand_r() function since its
// implementation of rand() is already thread safe
int rand_reentrant(unsigned int* s) {
-#ifdef EIGEN_COMP_MSVC_STRICT
+#if EIGEN_COMP_MSVC_STRICT
EIGEN_UNUSED_VARIABLE(s);
return rand();
#else
diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 79f4066..d9d5da4 100644
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -23,8 +23,8 @@
void test_gpu_simple_argmax()
{
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
- Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
- Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+ Tensor<DenseIndex, 0, Layout> out_max;
+ Tensor<DenseIndex, 0, Layout> out_min;
in.setRandom();
in *= in.constant(100.0);
in(0, 0, 0) = -1000.0;
@@ -46,8 +46,8 @@
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout>, Aligned > gpu_out_max(d_out_max);
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout>, Aligned > gpu_out_min(d_out_min);
gpu_out_max.device(gpu_device) = gpu_in.argmax();
gpu_out_min.device(gpu_device) = gpu_in.argmin();
@@ -56,8 +56,8 @@
assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
- VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
- VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+ VERIFY_IS_EQUAL(out_max(), 72*53*97 - 1);
+ VERIFY_IS_EQUAL(out_min(), 0);
gpuFree(d_in);
gpuFree(d_out_max);
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index 27a8254..be7afd2 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -27,17 +27,64 @@
// Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host
-namespace cl {
-namespace sycl {
-template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
+namespace SYCL {
+template <typename T> T abs(T x) {
+ return cl::sycl::abs(x);
+}
+
+template <> float abs(float x) {
+ return cl::sycl::fabs(x);
+}
+
+template <> double abs(double x) {
+ return cl::sycl::fabs(x);
+}
+
template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; }
-template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
-template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
+template <typename T> T cwiseMax(T x, T y) {
+return cl::sycl::max(x, y);
+}
+template <typename T> T cwiseMin(T x, T y) {
+ return cl::sycl::min(x, y);
}
}
+
+#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \
+ struct op_##FUNC { \
+ template <typename T> \
+ auto operator()(const T& x) { \
+ return SYCL::FUNC(x); \
+ } \
+ template <typename T> \
+ auto operator()(const TensorMap<T>& x) { \
+ return x.FUNC(); \
+ } \
+ };
+
+DECLARE_UNARY_STRUCT_NON_SYCL(abs)
+DECLARE_UNARY_STRUCT_NON_SYCL(square)
+DECLARE_UNARY_STRUCT_NON_SYCL(cube)
+DECLARE_UNARY_STRUCT_NON_SYCL(inverse)
+
+#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \
+ struct op_##FUNC { \
+ template <typename T1, typename T2> \
+ auto operator()(const T1& x, const T2& y){ \
+ return SYCL::FUNC(x, y); \
+ } \
+ template <typename T1, typename T2> \
+ auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) { \
+ return x.FUNC(y); \
+ } \
+ };
+
+DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax)
+DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin)
+
+
struct EqualAssignment {
template <typename Lhs, typename Rhs>
void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
@@ -119,12 +166,9 @@
} \
};
-DECLARE_UNARY_STRUCT(abs)
+
DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt)
-DECLARE_UNARY_STRUCT(square)
-DECLARE_UNARY_STRUCT(cube)
-DECLARE_UNARY_STRUCT(inverse)
DECLARE_UNARY_STRUCT(tanh)
DECLARE_UNARY_STRUCT(exp)
DECLARE_UNARY_STRUCT(expm1)
@@ -288,8 +332,6 @@
} \
};
-DECLARE_BINARY_STRUCT(cwiseMax)
-DECLARE_BINARY_STRUCT(cwiseMin)
#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
struct op_##NAME { \
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index d7ff38d..74e9026 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -23,6 +23,13 @@
#include <stdint.h>
#include <iostream>
+#ifdef SYCL_COMPILER_IS_DPCPP
+template <typename T>
+struct cl::sycl::is_device_copyable<
+ const OffByOneScalar<T>,
+ std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
+#endif
+
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 7b3fb5a..31baf1b 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -1100,9 +1100,9 @@
template <typename Scalar>
void test_gpu_ndtri()
{
- Tensor<Scalar, 1> in_x(8);
- Tensor<Scalar, 1> out(8);
- Tensor<Scalar, 1> expected_out(8);
+ Tensor<Scalar, 1> in_x(9);
+ Tensor<Scalar, 1> out(9);
+ Tensor<Scalar, 1> expected_out(9);
out.setZero();
in_x(0) = Scalar(1);