BEGIN_PUBLIC
Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/fd1dcb6b45a2c797ad4c4d6cc7678ee70763b4ed
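
Notable changes picked up by this update:
* numext::bit_cast specializations between numext::uint16_t and
  Eigen::half / Eigen::bfloat16, plus raw_half_as_uint16 and
  raw_bfloat16_as_uint16 helpers for reading the raw bit pattern.
* Removal of the explicit scalar cast operators on Eigen::half (bool,
  the integer types, float, double, std::complex) in favor of a single
  implicit, lossless operator float(); the explicit bool cast on
  Eigen::bfloat16 is removed as well.
* New Eigen::half math functions asin, acos, and rint.
* NEON: the pgather/pscatter/pselect/ptranspose/predux_half_dowto4
  specializations are marked EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  instead of inline, and Packet8hf becomes the default Eigen::half
  packet (size 8, HasHalfPacket = 1, with Packet4hf as its half-sized
  packet).
* Tests now check half/bfloat16 bit patterns via numext::bit_cast
  instead of reading the raw .x / .value members, plus assorted
  comment and whitespace cleanups.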
END_PUBLIC
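
A minimal sketch (not part of the patch) showing the numext::bit_cast
round-trip this update enables; it mirrors the updated tests in
test/half_float.cpp and test/bfloat16_float.cpp and assumes a
translation unit built against Eigen at the commit above:

  #include <Eigen/Core>
  #include <cassert>

  int main() {
    // Eigen::half <-> raw bits: 1.0f is 0x3c00 in IEEE binary16.
    Eigen::half h(1.0f);
    Eigen::numext::uint16_t h_bits =
        Eigen::numext::bit_cast<Eigen::numext::uint16_t>(h);
    assert(h_bits == 0x3c00);
    assert(Eigen::numext::bit_cast<Eigen::half>(h_bits) == h);

    // Eigen::bfloat16 <-> raw bits: 1.0f is 0x3f80 in bfloat16.
    Eigen::bfloat16 b(1.0f);
    Eigen::numext::uint16_t b_bits =
        Eigen::numext::bit_cast<Eigen::numext::uint16_t>(b);
    assert(b_bits == 0x3f80);
    assert(Eigen::numext::bit_cast<Eigen::bfloat16>(b_bits) == b);
    return 0;
  }
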
PiperOrigin-RevId: 343602836
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 63ceace..351f451 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -69,7 +69,7 @@
template<class T>
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const T& val)
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
-
+
explicit EIGEN_DEVICE_FUNC bfloat16(float f)
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
@@ -82,14 +82,6 @@
EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless.
return bfloat16_impl::bfloat16_to_float(*this);
}
-
-#if EIGEN_HAS_CXX11
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
- // +0.0 and -0.0 become false, everything else becomes true.
- return (value & 0x7fff) != 0;
- }
-#endif
-
};
} // namespace Eigen
@@ -272,10 +264,14 @@
return output;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
return __bfloat16_raw(value);
}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
+ return bf.value;
+}
+
// float_to_bfloat16_rtne template specialization that does not make any
// assumption about the value of its function argument (ff).
template <>
@@ -454,7 +450,7 @@
// float_to_bfloat16_rtne template specialization that assumes that its function
// argument (ff) is either a normal floating point number, or +/-infinity, or
// zero. Used to improve the runtime performance of conversion from an integer
-// type to bfloat16.
+// type to bfloat16.
template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
@@ -691,7 +687,17 @@
return (bfloat16_impl::isfinite)(h);
}
-} // namespace numext
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
+ return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src) {
+ return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
+}
+
+} // namespace numext
} // namespace Eigen
#endif // EIGEN_BFLOAT16_H
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 0bc1e9d..4dde913 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -159,55 +159,9 @@
explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
: half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
- // +0.0 and -0.0 become false, everything else becomes true.
- #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- return (numext::bit_cast<numext::uint16_t>(x) & 0x7fff) != 0;
- #else
- return (x & 0x7fff) != 0;
- #endif
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
- return static_cast<signed char>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
- return static_cast<unsigned char>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
- return static_cast<short>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(numext::uint16_t) const {
- return static_cast<numext::uint16_t>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
- return static_cast<int>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
- return static_cast<unsigned int>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
- return static_cast<long>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
- return static_cast<unsigned long>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
- return static_cast<long long>(half_impl::half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
- return static_cast<unsigned long long>(half_to_float(*this));
- }
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+ EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless.
return half_impl::half_to_float(*this);
}
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
- return static_cast<double>(half_impl::half_to_float(*this));
- }
-
- template<typename RealScalar>
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(std::complex<RealScalar>) const {
- return std::complex<RealScalar>(static_cast<RealScalar>(*this), RealScalar(0));
- }
};
} // end namespace Eigen
@@ -494,6 +448,19 @@
#endif
}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
+ // HIP/CUDA/Default have a member 'x' of type uint16_t.
+ // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
+ // For SYCL, cl::sycl::half is _Float16, so cast directly.
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+ return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(SYCL_DEVICE_ONLY)
+ return numext::bit_cast<numext::uint16_t>(h);
+#else
+ return h.x;
+#endif
+}
+
union float32_bits {
unsigned int u;
float f;
@@ -673,6 +640,12 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
+ return half(::asinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
+ return half(::acosf(float(a)));
+}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -681,6 +654,9 @@
return half(::floorf(float(a)));
#endif
}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
+ return half(::rintf(float(a)));
+}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -812,10 +788,11 @@
}
#endif
-#if defined(EIGEN_GPU_COMPILE_PHASE)
namespace Eigen {
namespace numext {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
return (half_impl::isnan)(h);
@@ -830,8 +807,20 @@
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
return (half_impl::isfinite)(h);
}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast<Eigen::half, uint16_t>(const uint16_t& src) {
+ return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(const Eigen::half& src) {
+ return Eigen::half_impl::raw_half_as_uint16(src);
+}
+
} // namespace numext
} // namespace Eigen
-#endif
#endif // EIGEN_HALF_H
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 30edd70..709cebe 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -87,8 +87,8 @@
// functionally equivalent to _mm_shuffle_ps in SSE when interleave
// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
-// to enable a shared implementation for fast inversion of matrices of size 4.
-template<bool interleave>
+// to enable a shared implementation for fast inversion of matrices of size 4.
+template<bool interleave>
EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask)
{
const float* a = reinterpret_cast<const float*>(&m);
@@ -97,8 +97,8 @@
return res;
}
-template<>
-EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask)
+template<>
+EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask)
{
const float* a = reinterpret_cast<const float*>(&m);
const float* b = reinterpret_cast<const float*>(&n);
@@ -109,7 +109,7 @@
EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
-{
+{
return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
@@ -1922,13 +1922,13 @@
template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
-template<> EIGEN_DEVICE_FUNC inline Packet2f pgather<float, Packet2f>(const float* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
{
Packet2f res = vld1_dup_f32(from);
res = vld1_lane_f32(from + 1*stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
Packet4f res = vld1q_dup_f32(from);
res = vld1q_lane_f32(from + 1*stride, res, 1);
@@ -1936,14 +1936,14 @@
res = vld1q_lane_f32(from + 3*stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
{
Packet4c res;
for (int i = 0; i != 4; i++)
reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
{
Packet8c res = vld1_dup_s8(from);
res = vld1_lane_s8(from + 1*stride, res, 1);
@@ -1955,7 +1955,7 @@
res = vld1_lane_s8(from + 7*stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
{
Packet16c res = vld1q_dup_s8(from);
res = vld1q_lane_s8(from + 1*stride, res, 1);
@@ -1975,14 +1975,14 @@
res = vld1q_lane_s8(from + 15*stride, res, 15);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
{
Packet4uc res;
for (int i = 0; i != 4; i++)
reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
{
Packet8uc res = vld1_dup_u8(from);
res = vld1_lane_u8(from + 1*stride, res, 1);
@@ -1994,7 +1994,7 @@
res = vld1_lane_u8(from + 7*stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
{
Packet16uc res = vld1q_dup_u8(from);
res = vld1q_lane_u8(from + 1*stride, res, 1);
@@ -2014,7 +2014,7 @@
res = vld1q_lane_u8(from + 15*stride, res, 15);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
{
Packet4s res = vld1_dup_s16(from);
res = vld1_lane_s16(from + 1*stride, res, 1);
@@ -2022,7 +2022,7 @@
res = vld1_lane_s16(from + 3*stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
{
Packet8s res = vld1q_dup_s16(from);
res = vld1q_lane_s16(from + 1*stride, res, 1);
@@ -2034,7 +2034,7 @@
res = vld1q_lane_s16(from + 7*stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
{
Packet4us res = vld1_dup_u16(from);
res = vld1_lane_u16(from + 1*stride, res, 1);
@@ -2042,7 +2042,7 @@
res = vld1_lane_u16(from + 3*stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
{
Packet8us res = vld1q_dup_u16(from);
res = vld1q_lane_u16(from + 1*stride, res, 1);
@@ -2054,13 +2054,13 @@
res = vld1q_lane_u16(from + 7*stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
{
Packet2i res = vld1_dup_s32(from);
res = vld1_lane_s32(from + 1*stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
{
Packet4i res = vld1q_dup_s32(from);
res = vld1q_lane_s32(from + 1*stride, res, 1);
@@ -2068,13 +2068,13 @@
res = vld1q_lane_s32(from + 3*stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
{
Packet2ui res = vld1_dup_u32(from);
res = vld1_lane_u32(from + 1*stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
{
Packet4ui res = vld1q_dup_u32(from);
res = vld1q_lane_u32(from + 1*stride, res, 1);
@@ -2082,37 +2082,37 @@
res = vld1q_lane_u32(from + 3*stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
{
Packet2l res = vld1q_dup_s64(from);
res = vld1q_lane_s64(from + 1*stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
{
Packet2ul res = vld1q_dup_u64(from);
res = vld1q_lane_u64(from + 1*stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
{
vst1_lane_f32(to + stride*0, from, 0);
vst1_lane_f32(to + stride*1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
vst1q_lane_f32(to + stride*0, from, 0);
vst1q_lane_f32(to + stride*1, from, 1);
vst1q_lane_f32(to + stride*2, from, 2);
vst1q_lane_f32(to + stride*3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
{
for (int i = 0; i != 4; i++)
*(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
{
vst1_lane_s8(to + stride*0, from, 0);
vst1_lane_s8(to + stride*1, from, 1);
@@ -2123,7 +2123,7 @@
vst1_lane_s8(to + stride*6, from, 6);
vst1_lane_s8(to + stride*7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
{
vst1q_lane_s8(to + stride*0, from, 0);
vst1q_lane_s8(to + stride*1, from, 1);
@@ -2142,12 +2142,12 @@
vst1q_lane_s8(to + stride*14, from, 14);
vst1q_lane_s8(to + stride*15, from, 15);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
{
for (int i = 0; i != 4; i++)
*(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
{
vst1_lane_u8(to + stride*0, from, 0);
vst1_lane_u8(to + stride*1, from, 1);
@@ -2158,7 +2158,7 @@
vst1_lane_u8(to + stride*6, from, 6);
vst1_lane_u8(to + stride*7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
{
vst1q_lane_u8(to + stride*0, from, 0);
vst1q_lane_u8(to + stride*1, from, 1);
@@ -2177,14 +2177,14 @@
vst1q_lane_u8(to + stride*14, from, 14);
vst1q_lane_u8(to + stride*15, from, 15);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
{
vst1_lane_s16(to + stride*0, from, 0);
vst1_lane_s16(to + stride*1, from, 1);
vst1_lane_s16(to + stride*2, from, 2);
vst1_lane_s16(to + stride*3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
{
vst1q_lane_s16(to + stride*0, from, 0);
vst1q_lane_s16(to + stride*1, from, 1);
@@ -2195,14 +2195,14 @@
vst1q_lane_s16(to + stride*6, from, 6);
vst1q_lane_s16(to + stride*7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
{
vst1_lane_u16(to + stride*0, from, 0);
vst1_lane_u16(to + stride*1, from, 1);
vst1_lane_u16(to + stride*2, from, 2);
vst1_lane_u16(to + stride*3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
{
vst1q_lane_u16(to + stride*0, from, 0);
vst1q_lane_u16(to + stride*1, from, 1);
@@ -2213,36 +2213,36 @@
vst1q_lane_u16(to + stride*6, from, 6);
vst1q_lane_u16(to + stride*7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
{
vst1_lane_s32(to + stride*0, from, 0);
vst1_lane_s32(to + stride*1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{
vst1q_lane_s32(to + stride*0, from, 0);
vst1q_lane_s32(to + stride*1, from, 1);
vst1q_lane_s32(to + stride*2, from, 2);
vst1q_lane_s32(to + stride*3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
{
vst1_lane_u32(to + stride*0, from, 0);
vst1_lane_u32(to + stride*1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
{
vst1q_lane_u32(to + stride*0, from, 0);
vst1q_lane_u32(to + stride*1, from, 1);
vst1q_lane_u32(to + stride*2, from, 2);
vst1q_lane_u32(to + stride*3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
{
vst1q_lane_s64(to + stride*0, from, 0);
vst1q_lane_s64(to + stride*1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
{
vst1q_lane_u64(to + stride*0, from, 0);
vst1q_lane_u64(to + stride*1, from, 1);
@@ -2457,23 +2457,23 @@
template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
-template<> EIGEN_DEVICE_FUNC inline Packet4c predux_half_dowto4(const Packet8c& a)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
{
return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8c predux_half_dowto4(const Packet16c& a)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
-template<> EIGEN_DEVICE_FUNC inline Packet4uc predux_half_dowto4(const Packet8uc& a)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
{
return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8uc predux_half_dowto4(const Packet16uc& a)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
-template<> EIGEN_DEVICE_FUNC inline Packet4s predux_half_dowto4(const Packet8s& a)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
-template<> EIGEN_DEVICE_FUNC inline Packet4us predux_half_dowto4(const Packet8us& a)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
// Other reduction functions:
@@ -2752,13 +2752,13 @@
return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2f, 2>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel)
{
const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = z.val[0];
kernel.packet[1] = z.val[1];
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel)
{
const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
@@ -2768,7 +2768,7 @@
kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4c, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
{
const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
@@ -2781,7 +2781,7 @@
kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8c, 8>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel)
{
int8x8x2_t zip8[4];
uint16x4x2_t zip16[4];
@@ -2811,7 +2811,7 @@
}
}
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel)
{
int8x16x2_t zip8[8];
uint16x8x2_t zip16[8];
@@ -2858,7 +2858,7 @@
}
}
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
{
const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
@@ -2871,7 +2871,7 @@
kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8uc, 8>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel)
{
uint8x8x2_t zip8[4];
uint16x4x2_t zip16[4];
@@ -2901,7 +2901,7 @@
}
}
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel)
{
uint8x16x2_t zip8[8];
uint16x8x2_t zip16[8];
@@ -2946,7 +2946,7 @@
}
}
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4s, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel)
{
const int16x4x2_t zip16_1 = vzip_s16(kernel.packet[0], kernel.packet[1]);
const int16x4x2_t zip16_2 = vzip_s16(kernel.packet[2], kernel.packet[3]);
@@ -2960,7 +2960,7 @@
kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel)
{
const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);
const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]);
@@ -2974,7 +2974,7 @@
kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel)
{
const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]);
const uint8x16x2_t zip8_2 = vzipq_u8(kernel.packet[2], kernel.packet[3]);
@@ -2988,7 +2988,7 @@
kernel.packet[3] = vreinterpretq_u8_u16(zip16_2.val[1]);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel)
{
const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);
const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]);
@@ -3009,7 +3009,7 @@
kernel.packet[6] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1])));
kernel.packet[7] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1])));
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4us, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel)
{
const uint16x4x2_t zip16_1 = vzip_u16(kernel.packet[0], kernel.packet[1]);
const uint16x4x2_t zip16_2 = vzip_u16(kernel.packet[2], kernel.packet[3]);
@@ -3022,7 +3022,7 @@
kernel.packet[2] = vreinterpret_u16_u32(zip32_2.val[0]);
kernel.packet[3] = vreinterpret_u16_u32(zip32_2.val[1]);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel)
{
const uint16x8x2_t zip16_1 = vzipq_u16(kernel.packet[0], kernel.packet[1]);
const uint16x8x2_t zip16_2 = vzipq_u16(kernel.packet[2], kernel.packet[3]);
@@ -3043,13 +3043,13 @@
kernel.packet[6] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1])));
kernel.packet[7] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1])));
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2i, 2>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel)
{
const int32x2x2_t z = vzip_s32(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = z.val[0];
kernel.packet[1] = z.val[1];
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel)
{
const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
@@ -3059,13 +3059,13 @@
kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2ui, 2>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel)
{
const uint32x2x2_t z = vzip_u32(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = z.val[0];
kernel.packet[1] = z.val[1];
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ui, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel)
{
const uint32x4x2_t tmp1 = vzipq_u32(kernel.packet[0], kernel.packet[1]);
const uint32x4x2_t tmp2 = vzipq_u32(kernel.packet[2], kernel.packet[3]);
@@ -3075,7 +3075,7 @@
kernel.packet[2] = vcombine_u32(vget_low_u32(tmp1.val[1]), vget_low_u32(tmp2.val[1]));
kernel.packet[3] = vcombine_u32(vget_high_u32(tmp1.val[1]), vget_high_u32(tmp2.val[1]));
}
-EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet2l, 2>& kernel)
{
#if EIGEN_ARCH_ARM64
@@ -3094,7 +3094,7 @@
kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
#endif
}
-EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet2ul, 2>& kernel)
{
#if EIGEN_ARCH_ARM64
@@ -3114,37 +3114,37 @@
#endif
}
-template<> EIGEN_DEVICE_FUNC inline Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
{ return vbsl_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
{ return vbslq_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
{ return vbsl_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
{ return vbslq_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
{ return vbsl_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
{ return vbslq_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
{ return vbslq_u64(mask, a, b); }
/**
@@ -3254,7 +3254,7 @@
vcltq_f32(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
// Compute approximate reciprocal sqrt.
Packet4f x = vrsqrteq_f32(_x);
- // Do a single step of Newton's iteration.
+ // Do a single step of Newton's iteration.
//the number 1.5f was set reference to Quake3's fast inverse square root
x = vmulq_f32(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x, x))));
// Flush results for denormals to zero.
@@ -3273,7 +3273,7 @@
return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(pmul(_x, x)), denormal_mask));
}
-#else
+#else
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
#endif
@@ -3441,7 +3441,7 @@
return pandnot<Packet4us>(a, b);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
const Packet4bf& b)
{
return pselect<Packet4us>(mask, a, b);
@@ -3507,7 +3507,7 @@
return preverse<Packet4us>(a);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
{
PacketBlock<Packet4us, 4> k;
k.packet[0] = kernel.packet[0];
@@ -3573,7 +3573,7 @@
typedef float64x1_t Packet1d;
// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
-// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
// for fast inversion of matrices of size 4.
EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
{
@@ -3739,7 +3739,7 @@
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
Packet2d res = pset1<Packet2d>(0.0);
res = vld1q_lane_f64(from + 0*stride, res, 0);
@@ -3747,7 +3747,7 @@
return res;
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
vst1q_lane_f64(to + stride*0, from, 0);
vst1q_lane_f64(to + stride*1, from, 1);
@@ -3791,7 +3791,7 @@
{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
-EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet2d, 2>& kernel)
{
const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
@@ -3801,7 +3801,7 @@
kernel.packet[1] = tmp2;
}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
@@ -3829,7 +3829,7 @@
vcltq_f64(_x, pset1<Packet2d>((std::numeric_limits<double>::min)())));
// Compute approximate reciprocal sqrt.
Packet2d x = vrsqrteq_f64(_x);
- // Do a single step of Newton's iteration.
+ // Do a single step of Newton's iteration.
//the number 1.5f was set reference to Quake3's fast inverse square root
x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
// Do one more Newton's iteration to get more accurate result.
@@ -3838,7 +3838,7 @@
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
}
-#else
+#else
template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
#endif
@@ -3849,16 +3849,15 @@
typedef float16x4_t Packet4hf;
typedef float16x8_t Packet8hf;
-// TODO(tellenbach): Enable packets of size 8 as soon as the GEBP can handle them
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
- typedef Packet4hf type;
+ typedef Packet8hf type;
typedef Packet4hf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size = 4,
- HasHalfPacket = 0,
+ size = 8,
+ HasHalfPacket = 1,
HasCmp = 1,
HasCast = 1,
@@ -3904,7 +3903,7 @@
template <>
struct unpacket_traits<Packet8hf> {
typedef Eigen::half type;
- typedef Packet8hf half;
+ typedef Packet4hf half;
enum {
size = 8,
alignment = Aligned16,
@@ -3914,6 +3913,11 @@
};
};
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
+ return vadd_f16(vget_low_f16(a), vget_high_f16(a));
+}
+
template <>
EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
return vdupq_n_f16(from.x);
@@ -4189,23 +4193,23 @@
return vcombine_f16(lo, hi);
}
-EIGEN_DEVICE_FUNC inline Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
-EIGEN_DEVICE_FUNC inline Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
template <>
-EIGEN_DEVICE_FUNC inline Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
}
template <>
-EIGEN_DEVICE_FUNC inline Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
}
-EIGEN_DEVICE_FUNC inline Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
-EIGEN_DEVICE_FUNC inline Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
@@ -4228,7 +4232,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
res = vsetq_lane_f16(from[0 * stride].x, res, 0);
res = vsetq_lane_f16(from[1 * stride].x, res, 1);
@@ -4242,7 +4246,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
res = vset_lane_f16(from[0 * stride].x, res, 0);
res = vset_lane_f16(from[1 * stride].x, res, 1);
@@ -4252,7 +4256,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
to[stride * 0].x = vgetq_lane_f16(from, 0);
to[stride * 1].x = vgetq_lane_f16(from, 1);
to[stride * 2].x = vgetq_lane_f16(from, 2);
@@ -4264,7 +4268,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
to[stride * 0].x = vget_lane_f16(from, 0);
to[stride * 1].x = vget_lane_f16(from, 1);
to[stride * 2].x = vget_lane_f16(from, 2);
@@ -4418,7 +4422,8 @@
return h;
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
+{
EIGEN_ALIGN16 Eigen::half in[4][8];
pstore<Eigen::half>(in[0], kernel.packet[0]);
@@ -4432,11 +4437,11 @@
for (int i = 0; i < 4; ++i) {
EIGEN_UNROLL_LOOP
for (int j = 0; j < 4; ++j) {
- out[i][j] = in[j][2*i];
+ out[i][j] = in[j][2 * i];
}
EIGEN_UNROLL_LOOP
for (int j = 0; j < 4; ++j) {
- out[i][j+4] = in[j][2*i+1];
+ out[i][j + 4] = in[j][2 * i + 1];
}
}
@@ -4446,7 +4451,7 @@
kernel.packet[3] = pload<Packet8hf>(out[3]);
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
EIGEN_ALIGN16 float16x4x4_t tmp_x4;
float16_t* tmp = (float16_t*)&kernel;
tmp_x4 = vld4_f16(tmp);
@@ -4457,7 +4462,7 @@
kernel.packet[3] = tmp_x4.val[3];
}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
float16x8x2_t T_1[4];
T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 9db119b..9486502 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
#define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
@@ -25,16 +25,16 @@
Index rows = lhs.innerSize();
Index cols = rhs.outerSize();
eigen_assert(lhs.outerSize() == rhs.innerSize());
-
+
ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
-
+
std::memset(mask,0,sizeof(bool)*rows);
evaluator<Lhs> lhsEval(lhs);
evaluator<Rhs> rhsEval(rhs);
-
+
// estimate the number of non zero entries
// given a rhs column containing Y non zeros, we assume that the respective Y columns
// of the lhs differs in average of one non zeros, thus the number of non zeros for
@@ -141,7 +141,7 @@
typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrixAux;
typedef typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type ColMajorMatrix;
-
+
// If the result is tall and thin (in the extreme case a column vector)
// then it is faster to sort the coefficients inplace instead of transposing twice.
// FIXME, the following heuristic is probably not very good.
@@ -155,7 +155,7 @@
else
{
ColMajorMatrixAux resCol(lhs.rows(),rhs.cols());
- // ressort to transpose to sort the entries
+ // resort to transpose to sort the entries
internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrixAux>(lhs, rhs, resCol, false);
RowMajorMatrix resRow(resCol);
res = resRow.markAsRValue();
diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp
index 09df2b2..fc648df 100644
--- a/test/bfloat16_float.cpp
+++ b/test/bfloat16_float.cpp
@@ -13,6 +13,9 @@
#include <Eigen/src/Core/arch/Default/BFloat16.h>
+#define VERIFY_BFLOAT16_BITS_EQUAL(h, bits) \
+ VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
+
// Make sure it's possible to forward declare Eigen::bfloat16
namespace Eigen {
struct bfloat16;
@@ -58,31 +61,45 @@
{
using Eigen::bfloat16_impl::__bfloat16_raw;
+ // Round-trip casts
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(1.0f))),
+ bfloat16(1.0f));
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(0.5f))),
+ bfloat16(0.5f));
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(-0.33333f))),
+ bfloat16(-0.33333f));
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(0.0f))),
+ bfloat16(0.0f));
+
// Conversion from float.
- VERIFY_IS_EQUAL(bfloat16(1.0f).value, 0x3f80);
- VERIFY_IS_EQUAL(bfloat16(0.5f).value, 0x3f00);
- VERIFY_IS_EQUAL(bfloat16(0.33333f).value, 0x3eab);
- VERIFY_IS_EQUAL(bfloat16(3.38e38f).value, 0x7f7e);
- VERIFY_IS_EQUAL(bfloat16(3.40e38f).value, 0x7f80); // Becomes infinity.
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(1.0f), 0x3f80);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f), 0x3f00);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.33333f), 0x3eab);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3.38e38f), 0x7f7e);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3.40e38f), 0x7f80); // Becomes infinity.
// Verify round-to-nearest-even behavior.
float val1 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c00)));
float val2 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c01)));
float val3 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c02)));
- VERIFY_IS_EQUAL(bfloat16(0.5f * (val1 + val2)).value, 0x3c00);
- VERIFY_IS_EQUAL(bfloat16(0.5f * (val2 + val3)).value, 0x3c02);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f * (val1 + val2)), 0x3c00);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f * (val2 + val3)), 0x3c02);
// Conversion from int.
- VERIFY_IS_EQUAL(bfloat16(-1).value, 0xbf80);
- VERIFY_IS_EQUAL(bfloat16(0).value, 0x0000);
- VERIFY_IS_EQUAL(bfloat16(1).value, 0x3f80);
- VERIFY_IS_EQUAL(bfloat16(2).value, 0x4000);
- VERIFY_IS_EQUAL(bfloat16(3).value, 0x4040);
- VERIFY_IS_EQUAL(bfloat16(12).value, 0x4140);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-1), 0xbf80);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0), 0x0000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(1), 0x3f80);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(2), 0x4000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3), 0x4040);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(12), 0x4140);
// Conversion from bool.
- VERIFY_IS_EQUAL(bfloat16(false).value, 0x0000);
- VERIFY_IS_EQUAL(bfloat16(true).value, 0x3f80);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(false), 0x0000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(true), 0x3f80);
// Conversion to bool
VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(3)), true);
@@ -102,8 +119,8 @@
VERIFY_IS_EQUAL(bfloat16(0.0f), bfloat16(0.0f));
VERIFY_IS_EQUAL(bfloat16(-0.0f), bfloat16(0.0f));
VERIFY_IS_EQUAL(bfloat16(-0.0f), bfloat16(-0.0f));
- VERIFY_IS_EQUAL(bfloat16(0.0f).value, 0x0000);
- VERIFY_IS_EQUAL(bfloat16(-0.0f).value, 0x8000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000);
// Flush denormals to zero
for (float denorm = -std::numeric_limits<float>::denorm_min();
@@ -117,16 +134,16 @@
VERIFY_IS_EQUAL(bfloat16(denorm), false);
if (std::signbit(denorm)) {
- VERIFY_IS_EQUAL(bf_trunc.value, 0x8000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000);
} else {
- VERIFY_IS_EQUAL(bf_trunc.value, 0x0000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000);
}
bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(denorm);
VERIFY_IS_EQUAL(static_cast<float>(bf_round), 0.0f);
if (std::signbit(denorm)) {
- VERIFY_IS_EQUAL(bf_round.value, 0x8000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000);
} else {
- VERIFY_IS_EQUAL(bf_round.value, 0x0000);
+ VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000);
}
}
@@ -231,33 +248,35 @@
VERIFY((numext::isinf)(bfloat16(__bfloat16_raw(0x7f80))));
VERIFY((numext::isnan)(bfloat16(__bfloat16_raw(0x7fc0))));
- VERIFY_IS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)).value, 0x7fc0);
- VERIFY_IS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)).value, 0xffc0);
- VERIFY_IS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
- BinaryToFloat(0x0, 0xff, 0x40, 0x0))
- .value,
- 0x7fc0);
- VERIFY_IS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
- BinaryToFloat(0x1, 0xff, 0x40, 0x0))
- .value,
- 0xffc0);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0);
+ VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0);
+ VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
+ BinaryToFloat(0x0, 0xff, 0x40, 0x0)),
+ 0x7fc0);
+ VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
+ BinaryToFloat(0x1, 0xff, 0x40, 0x0)),
+ 0xffc0);
}
void test_numtraits()
{
- std::cout << "epsilon = " << NumTraits<bfloat16>::epsilon() << " (0x" << std::hex << NumTraits<bfloat16>::epsilon().value << ")" << std::endl;
- std::cout << "highest = " << NumTraits<bfloat16>::highest() << " (0x" << std::hex << NumTraits<bfloat16>::highest().value << ")" << std::endl;
- std::cout << "lowest = " << NumTraits<bfloat16>::lowest() << " (0x" << std::hex << NumTraits<bfloat16>::lowest().value << ")" << std::endl;
- std::cout << "min = " << (std::numeric_limits<bfloat16>::min)() << " (0x" << std::hex << (std::numeric_limits<bfloat16>::min)().value << ")" << std::endl;
- std::cout << "denorm min = " << (std::numeric_limits<bfloat16>::denorm_min)() << " (0x" << std::hex << (std::numeric_limits<bfloat16>::denorm_min)().value << ")" << std::endl;
- std::cout << "infinity = " << NumTraits<bfloat16>::infinity() << " (0x" << std::hex << NumTraits<bfloat16>::infinity().value << ")" << std::endl;
- std::cout << "quiet nan = " << NumTraits<bfloat16>::quiet_NaN() << " (0x" << std::hex << NumTraits<bfloat16>::quiet_NaN().value << ")" << std::endl;
- std::cout << "signaling nan = " << std::numeric_limits<bfloat16>::signaling_NaN() << " (0x" << std::hex << std::numeric_limits<bfloat16>::signaling_NaN().value << ")" << std::endl;
+ std::cout << "epsilon = " << NumTraits<bfloat16>::epsilon() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::epsilon()) << ")" << std::endl;
+ std::cout << "highest = " << NumTraits<bfloat16>::highest() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::highest()) << ")" << std::endl;
+ std::cout << "lowest = " << NumTraits<bfloat16>::lowest() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::lowest()) << ")" << std::endl;
+ std::cout << "min = " << (std::numeric_limits<bfloat16>::min)() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>((std::numeric_limits<bfloat16>::min)()) << ")" << std::endl;
+ std::cout << "denorm min = " << (std::numeric_limits<bfloat16>::denorm_min)() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>((std::numeric_limits<bfloat16>::denorm_min)()) << ")" << std::endl;
+ std::cout << "infinity = " << NumTraits<bfloat16>::infinity() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::infinity()) << ")" << std::endl;
+ std::cout << "quiet nan = " << NumTraits<bfloat16>::quiet_NaN() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::quiet_NaN()) << ")" << std::endl;
+ std::cout << "signaling nan = " << std::numeric_limits<bfloat16>::signaling_NaN() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::signaling_NaN()) << ")" << std::endl;
VERIFY(NumTraits<bfloat16>::IsSigned);
- VERIFY_IS_EQUAL( std::numeric_limits<bfloat16>::infinity().value, bfloat16(std::numeric_limits<float>::infinity()).value );
- VERIFY_IS_EQUAL( std::numeric_limits<bfloat16>::quiet_NaN().value, bfloat16(std::numeric_limits<float>::quiet_NaN()).value );
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::infinity()),
+ numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::infinity())) );
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::quiet_NaN()),
+ numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::quiet_NaN())) );
VERIFY( (std::numeric_limits<bfloat16>::min)() > bfloat16(0.f) );
VERIFY( (std::numeric_limits<bfloat16>::denorm_min)() > bfloat16(0.f) );
VERIFY_IS_EQUAL( (std::numeric_limits<bfloat16>::denorm_min)()/bfloat16(2), bfloat16(0.f) );
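With the raw .value member no longer part of the public surface, bit-level expectations go through numext::bit_cast, which this update specializes for bfloat16. A minimal sketch of the new idiom, assuming this Eigen revision:

  #include <Eigen/Core>
  #include <cstdio>

  int main() {
    Eigen::bfloat16 x(1.0f);  // 1.0f encodes as 0x3f800000 in float32
    Eigen::numext::uint16_t bits =
        Eigen::numext::bit_cast<Eigen::numext::uint16_t>(x);
    std::printf("0x%04x\n", bits);  // top 16 bits of 1.0f: prints 0x3f80
    return 0;
  }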
diff --git a/test/half_float.cpp b/test/half_float.cpp
index b301b37..cf6df54 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -11,6 +11,9 @@
#include <Eigen/src/Core/arch/Default/Half.h>
+#define VERIFY_HALF_BITS_EQUAL(h, bits) \
+ VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
+
// Make sure it's possible to forward declare Eigen::half
namespace Eigen {
struct half;
@@ -22,75 +25,51 @@
{
using Eigen::half_impl::__half_raw;
- // We don't use a uint16_t raw member x if the platform has native Arm __fp16
- // support
-#if !defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+ // Round-trip bit-cast with uint16.
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(1.0f))),
+ half(1.0f));
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(0.5f))),
+ half(0.5f));
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(-0.33333f))),
+ half(-0.33333f));
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(0.0f))),
+ half(0.0f));
+
// Conversion from float.
- VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
- VERIFY_IS_EQUAL(half(0.5f).x, 0x3800);
- VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555);
- VERIFY_IS_EQUAL(half(0.0f).x, 0x0000);
- VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000);
- VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff);
- VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity.
+ VERIFY_HALF_BITS_EQUAL(half(1.0f), 0x3c00);
+ VERIFY_HALF_BITS_EQUAL(half(0.5f), 0x3800);
+ VERIFY_HALF_BITS_EQUAL(half(0.33333f), 0x3555);
+ VERIFY_HALF_BITS_EQUAL(half(0.0f), 0x0000);
+ VERIFY_HALF_BITS_EQUAL(half(-0.0f), 0x8000);
+ VERIFY_HALF_BITS_EQUAL(half(65504.0f), 0x7bff);
+ VERIFY_HALF_BITS_EQUAL(half(65536.0f), 0x7c00); // Becomes infinity.
// Denormals.
- VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001);
- VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001);
- VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
+ VERIFY_HALF_BITS_EQUAL(half(-5.96046e-08f), 0x8001);
+ VERIFY_HALF_BITS_EQUAL(half(5.96046e-08f), 0x0001);
+ VERIFY_HALF_BITS_EQUAL(half(1.19209e-07f), 0x0002);
// Verify round-to-nearest-even behavior.
float val1 = float(half(__half_raw(0x3c00)));
float val2 = float(half(__half_raw(0x3c01)));
float val3 = float(half(__half_raw(0x3c02)));
- VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
- VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
+ VERIFY_HALF_BITS_EQUAL(half(0.5f * (val1 + val2)), 0x3c00);
+ VERIFY_HALF_BITS_EQUAL(half(0.5f * (val2 + val3)), 0x3c02);
// Conversion from int.
- VERIFY_IS_EQUAL(half(-1).x, 0xbc00);
- VERIFY_IS_EQUAL(half(0).x, 0x0000);
- VERIFY_IS_EQUAL(half(1).x, 0x3c00);
- VERIFY_IS_EQUAL(half(2).x, 0x4000);
- VERIFY_IS_EQUAL(half(3).x, 0x4200);
+ VERIFY_HALF_BITS_EQUAL(half(-1), 0xbc00);
+ VERIFY_HALF_BITS_EQUAL(half(0), 0x0000);
+ VERIFY_HALF_BITS_EQUAL(half(1), 0x3c00);
+ VERIFY_HALF_BITS_EQUAL(half(2), 0x4000);
+ VERIFY_HALF_BITS_EQUAL(half(3), 0x4200);
// Conversion from bool.
- VERIFY_IS_EQUAL(half(false).x, 0x0000);
- VERIFY_IS_EQUAL(half(true).x, 0x3c00);
-#endif
-
-#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- // Conversion from float.
- VERIFY_IS_EQUAL(half(1.0f).x, __fp16(1.0f));
- VERIFY_IS_EQUAL(half(0.5f).x, __fp16(0.5f));
- VERIFY_IS_EQUAL(half(0.33333f).x, __fp16(0.33333f));
- VERIFY_IS_EQUAL(half(0.0f).x, __fp16(0.0f));
- VERIFY_IS_EQUAL(half(-0.0f).x, __fp16(-0.0f));
- VERIFY_IS_EQUAL(half(65504.0f).x, __fp16(65504.0f));
- VERIFY_IS_EQUAL(half(65536.0f).x, __fp16(65536.0f)); // Becomes infinity.
-
- // Denormals.
- VERIFY_IS_EQUAL(half(-5.96046e-08f).x, __fp16(-5.96046e-08f));
- VERIFY_IS_EQUAL(half(5.96046e-08f).x, __fp16(5.96046e-08f));
- VERIFY_IS_EQUAL(half(1.19209e-07f).x, __fp16(1.19209e-07f));
-
- // Verify round-to-nearest-even behavior.
- float val1 = float(half(__half_raw(0x3c00)));
- float val2 = float(half(__half_raw(0x3c01)));
- float val3 = float(half(__half_raw(0x3c02)));
- VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, __fp16(0.5f * (val1 + val2)));
- VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, __fp16(0.5f * (val2 + val3)));
-
- // Conversion from int.
- VERIFY_IS_EQUAL(half(-1).x, __fp16(-1));
- VERIFY_IS_EQUAL(half(0).x, __fp16(0));
- VERIFY_IS_EQUAL(half(1).x, __fp16(1));
- VERIFY_IS_EQUAL(half(2).x, __fp16(2));
- VERIFY_IS_EQUAL(half(3).x, __fp16(3));
-
- // Conversion from bool.
- VERIFY_IS_EQUAL(half(false).x, __fp16(false));
- VERIFY_IS_EQUAL(half(true).x, __fp16(true));
-#endif
+ VERIFY_HALF_BITS_EQUAL(half(false), 0x0000);
+ VERIFY_HALF_BITS_EQUAL(half(true), 0x3c00);
// Conversion to float.
VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f);
@@ -143,24 +122,27 @@
void test_numtraits()
{
- std::cout << "epsilon = " << NumTraits<half>::epsilon() << " (0x" << std::hex << NumTraits<half>::epsilon().x << ")" << std::endl;
- std::cout << "highest = " << NumTraits<half>::highest() << " (0x" << std::hex << NumTraits<half>::highest().x << ")" << std::endl;
- std::cout << "lowest = " << NumTraits<half>::lowest() << " (0x" << std::hex << NumTraits<half>::lowest().x << ")" << std::endl;
- std::cout << "min = " << (std::numeric_limits<half>::min)() << " (0x" << std::hex << half((std::numeric_limits<half>::min)()).x << ")" << std::endl;
- std::cout << "denorm min = " << (std::numeric_limits<half>::denorm_min)() << " (0x" << std::hex << half((std::numeric_limits<half>::denorm_min)()).x << ")" << std::endl;
- std::cout << "infinity = " << NumTraits<half>::infinity() << " (0x" << std::hex << NumTraits<half>::infinity().x << ")" << std::endl;
- std::cout << "quiet nan = " << NumTraits<half>::quiet_NaN() << " (0x" << std::hex << NumTraits<half>::quiet_NaN().x << ")" << std::endl;
- std::cout << "signaling nan = " << std::numeric_limits<half>::signaling_NaN() << " (0x" << std::hex << std::numeric_limits<half>::signaling_NaN().x << ")" << std::endl;
+ std::cout << "epsilon = " << NumTraits<half>::epsilon() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::epsilon()) << ")" << std::endl;
+ std::cout << "highest = " << NumTraits<half>::highest() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::highest()) << ")" << std::endl;
+ std::cout << "lowest = " << NumTraits<half>::lowest() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::lowest()) << ")" << std::endl;
+ std::cout << "min = " << (std::numeric_limits<half>::min)() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(half((std::numeric_limits<half>::min)())) << ")" << std::endl;
+ std::cout << "denorm min = " << (std::numeric_limits<half>::denorm_min)() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(half((std::numeric_limits<half>::denorm_min)())) << ")" << std::endl;
+ std::cout << "infinity = " << NumTraits<half>::infinity() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::infinity()) << ")" << std::endl;
+ std::cout << "quiet nan = " << NumTraits<half>::quiet_NaN() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::quiet_NaN()) << ")" << std::endl;
+ std::cout << "signaling nan = " << std::numeric_limits<half>::signaling_NaN() << " (0x" << std::hex << numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()) << ")" << std::endl;
VERIFY(NumTraits<half>::IsSigned);
- VERIFY_IS_EQUAL( std::numeric_limits<half>::infinity().x, half(std::numeric_limits<float>::infinity()).x );
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::infinity()),
+ numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::infinity())) );
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::quiet_NaN()),
+ numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::quiet_NaN())) );
+ VERIFY_IS_EQUAL(
+ numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()),
+ numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::signaling_NaN())) );
-// If we have a native fp16 types this becomes a nan == nan comparision so we have to disable it
-#if !defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- VERIFY_IS_EQUAL( std::numeric_limits<half>::quiet_NaN().x, half(std::numeric_limits<float>::quiet_NaN()).x );
- VERIFY_IS_EQUAL( std::numeric_limits<half>::signaling_NaN().x, half(std::numeric_limits<float>::signaling_NaN()).x );
-#endif
VERIFY( (std::numeric_limits<half>::min)() > half(0.f) );
VERIFY( (std::numeric_limits<half>::denorm_min)() > half(0.f) );
VERIFY( (std::numeric_limits<half>::min)()/half(2) > half(0.f) );
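Comparing encodings rather than values is also what lets the quiet/signaling NaN checks run unconditionally: a NaN never compares equal to anything, itself included, so the old value-based test had to be disabled under native ARM __fp16 arithmetic. A small illustration of the pitfall, assuming this Eigen revision (which asserts the two encodings agree):

  #include <Eigen/Core>
  #include <limits>
  #include <cassert>

  int main() {
    using Eigen::half;
    half a = std::numeric_limits<half>::quiet_NaN();
    half b(std::numeric_limits<float>::quiet_NaN());
    assert(!(a == b));  // value comparison: NaN == NaN is always false
    // Bit comparison still distinguishes matching encodings.
    assert(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a) ==
           Eigen::numext::bit_cast<Eigen::numext::uint16_t>(b));
    return 0;
  }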
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 81425b8..afe36ea 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -1073,7 +1073,12 @@
CALL_SUBTEST_10(test::runner<uint64_t>::run());
CALL_SUBTEST_11(test::runner<std::complex<float> >::run());
CALL_SUBTEST_12(test::runner<std::complex<double> >::run());
+#if defined(EIGEN_VECTORIZE_AVX)

+ // AVX half packets are not fully implemented, so run only the base packetmath test instead of the full runner.
CALL_SUBTEST_13((packetmath<half, internal::packet_traits<half>::type>()));
+#else
+ CALL_SUBTEST_13(test::runner<half>::run());
+#endif
CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
CALL_SUBTEST_15(test::runner<bfloat16>::run());
g_first_pass = false;
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index c8caebe..6e85f69 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -100,6 +100,7 @@
VERIFY_IS_APPROX(m4=(m2t.transpose()*m3t.transpose()).pruned(0), refMat4=refMat2t.transpose()*refMat3t.transpose());
VERIFY_IS_APPROX(m4=(m2*m3t.transpose()).pruned(0), refMat4=refMat2*refMat3t.transpose());
+#ifndef EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
// make sure the right product implementation is called:
if((!SparseMatrixType::IsRowMajor) && m2.rows()<=m3.cols())
{
@@ -107,6 +108,7 @@
VERIFY_EVALUATION_COUNT(m4 = (m2*m3).pruned(0), 1);
VERIFY_EVALUATION_COUNT(m4 = (m2*m3).eval().pruned(0), 4);
}
+#endif
// and that pruning is effective:
{
@@ -151,7 +153,7 @@
VERIFY_IS_APPROX(dm4.noalias()-=m2*refMat3, refMat4-=refMat2*refMat3);
VERIFY_IS_APPROX(dm4=m2*(refMat3+refMat3), refMat4=refMat2*(refMat3+refMat3));
VERIFY_IS_APPROX(dm4=m2t.transpose()*(refMat3+refMat5)*0.5, refMat4=refMat2t.transpose()*(refMat3+refMat5)*0.5);
-
+
// sparse * dense vector
VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3.col(0), refMat4.col(0)=refMat2*refMat3.col(0));
VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3t.transpose().col(0), refMat4.col(0)=refMat2*refMat3t.transpose().col(0));
@@ -182,7 +184,7 @@
VERIFY_IS_APPROX( m4=m2.middleCols(c,1)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
VERIFY_IS_APPROX(dm4=m2.col(c)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
-
+
VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.col(c).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.middleCols(c,1).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
@@ -211,23 +213,23 @@
}
VERIFY_IS_APPROX(m6=m6*m6, refMat6=refMat6*refMat6);
-
+
// sparse matrix * sparse vector
ColSpVector cv0(cols), cv1;
DenseVector dcv0(cols), dcv1;
initSparse(2*density,dcv0, cv0);
-
+
RowSpVector rv0(depth), rv1;
RowDenseVector drv0(depth), drv1(rv1);
initSparse(2*density,drv0, rv0);
- VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);
+ VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);
VERIFY_IS_APPROX(rv1=rv0*m3, drv1=drv0*refMat3);
VERIFY_IS_APPROX(cv1=m3t.adjoint()*cv0, dcv1=refMat3t.adjoint()*dcv0);
VERIFY_IS_APPROX(cv1=rv0*m3, dcv1=drv0*refMat3);
VERIFY_IS_APPROX(rv1=m3*cv0, drv1=refMat3*dcv0);
}
-
+
// test matrix - diagonal product
{
DenseMatrix refM2 = DenseMatrix::Zero(rows, cols);
@@ -243,7 +245,7 @@
VERIFY_IS_APPROX(m3=m2.transpose()*d2, refM3=refM2.transpose()*d2);
VERIFY_IS_APPROX(m3=d2*m2, refM3=d2*refM2);
VERIFY_IS_APPROX(m3=d1*m2.transpose(), refM3=d1*refM2.transpose());
-
+
// also check with a SparseWrapper:
DenseVector v1 = DenseVector::Random(cols);
DenseVector v2 = DenseVector::Random(rows);
@@ -252,12 +254,12 @@
VERIFY_IS_APPROX(m3=m2.transpose()*v2.asDiagonal(), refM3=refM2.transpose()*v2.asDiagonal());
VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2, refM3=v2.asDiagonal()*refM2);
VERIFY_IS_APPROX(m3=v1.asDiagonal()*m2.transpose(), refM3=v1.asDiagonal()*refM2.transpose());
-
+
VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2*v1.asDiagonal(), refM3=v2.asDiagonal()*refM2*v1.asDiagonal());
VERIFY_IS_APPROX(v2=m2*v1.asDiagonal()*v1, refM2*v1.asDiagonal()*v1);
VERIFY_IS_APPROX(v3=v2.asDiagonal()*m2*v1, v2.asDiagonal()*refM2*v1);
-
+
// evaluate to a dense matrix to check the .row() and .col() iterator functions
VERIFY_IS_APPROX(d3=m2*d1, refM3=refM2*d1);
VERIFY_IS_APPROX(d3=m2.transpose()*d2, refM3=refM2.transpose()*d2);
@@ -310,20 +312,20 @@
VERIFY_IS_APPROX(x.noalias()+=mUp.template selfadjointView<Upper>()*b, refX+=refS*b);
VERIFY_IS_APPROX(x.noalias()-=mLo.template selfadjointView<Lower>()*b, refX-=refS*b);
VERIFY_IS_APPROX(x.noalias()+=mS.template selfadjointView<Upper|Lower>()*b, refX+=refS*b);
-
+
// sparse selfadjointView with sparse matrices
SparseMatrixType mSres(rows,rows);
VERIFY_IS_APPROX(mSres = mLo.template selfadjointView<Lower>()*mS,
refX = refLo.template selfadjointView<Lower>()*refS);
VERIFY_IS_APPROX(mSres = mS * mLo.template selfadjointView<Lower>(),
refX = refS * refLo.template selfadjointView<Lower>());
-
+
// sparse triangularView with dense matrices
VERIFY_IS_APPROX(x=mA.template triangularView<Upper>()*b, refX=refA.template triangularView<Upper>()*b);
VERIFY_IS_APPROX(x=mA.template triangularView<Lower>()*b, refX=refA.template triangularView<Lower>()*b);
VERIFY_IS_APPROX(x=b*mA.template triangularView<Upper>(), refX=b*refA.template triangularView<Upper>());
VERIFY_IS_APPROX(x=b*mA.template triangularView<Lower>(), refX=b*refA.template triangularView<Lower>());
-
+
// sparse triangularView with sparse matrices
VERIFY_IS_APPROX(mSres = mA.template triangularView<Lower>()*mS, refX = refA.template triangularView<Lower>()*refS);
VERIFY_IS_APPROX(mSres = mS * mA.template triangularView<Lower>(), refX = refS * refA.template triangularView<Lower>());
@@ -368,9 +370,9 @@
Vector d(1);
d[0] = 2;
-
+
double res = 2;
-
+
VERIFY_IS_APPROX( ( cmA*d.asDiagonal() ).eval().coeff(0,0), res );
VERIFY_IS_APPROX( ( d.asDiagonal()*rmA ).eval().coeff(0,0), res );
VERIFY_IS_APPROX( ( rmA*d.asDiagonal() ).eval().coeff(0,0), res );
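The new EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT guard compiles out the temporary-count assertions for consumers that define the macro before including this test (see unsupported/test/sparse_extra.cpp below). The counting idea behind VERIFY_EVALUATION_COUNT can be sketched in isolation; this is a simplified analogue, not Eigen's actual harness:

  #include <cassert>

  static int g_copies = 0;

  struct Tracked {
    Tracked() = default;
    Tracked(const Tracked&) { ++g_copies; }
    Tracked& operator=(const Tracked&) { ++g_copies; return *this; }
  };

  Tracked make() { return Tracked(); }

  int main() {
    g_copies = 0;
    Tracked t = make();  // copy elision (guaranteed in C++17): not counted
    (void)t;
    assert(g_copies == 0);
    Tracked u;
    u = make();          // copy-assignment from a temporary: counted once
    assert(g_copies == 1);
    return 0;
  }

An extra counted copy-assignment of exactly this kind is why the DynamicSparseMatrix instantiation opts out.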
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index ea286fe..13450e1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -91,24 +91,21 @@
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
- Eigen::half result;
- // Generate 10 random bits for the mantissa
+ // Generate 10 random bits for the mantissa, merge with exponent.
unsigned rnd = PCG_XSH_RS_generator(state, stream);
- result.x = static_cast<uint16_t>(rnd & 0x3ffu);
- // Set the exponent
- result.x |= (static_cast<uint16_t>(15) << 10);
+ const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+ Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
// Return the final result
return result - Eigen::half(1.0f);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
- Eigen::bfloat16 result;
- // Generate 7 random bits for the mantissa
+ // Generate 7 random bits for the mantissa, merge with exponent.
unsigned rnd = PCG_XSH_RS_generator(state, stream);
- result.value = static_cast<uint16_t>(rnd & 0x7fu);
- // Set the exponent
- result.value |= (static_cast<uint16_t>(127) << 7);
+ const uint16_t bfloat16_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+ Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(bfloat16_bits);
// Return the final result
return result - Eigen::bfloat16(1.0f);
}
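Both specializations rely on the same construction: fixing the exponent field at the bias (15 for half, 127 for bfloat16) while randomizing the mantissa yields a value uniformly distributed in [1, 2), and subtracting 1 maps it onto [0, 1). A standalone float32 analogue of the trick:

  #include <cstdint>
  #include <cstring>
  #include <cstdio>
  #include <random>

  int main() {
    std::mt19937 rng(42);
    uint32_t bits = (rng() & 0x7fffffu)     // 23 random mantissa bits
                  | (uint32_t(127) << 23);  // exponent = bias -> value in [1, 2)
    float f;
    std::memcpy(&f, &bits, sizeof f);       // portable bit cast
    std::printf("%f\n", f - 1.0f);          // uniform in [0, 1)
    return 0;
  }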
@@ -169,19 +166,19 @@
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
#ifdef EIGEN_USE_SYCL
- // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
// Therefore, we need two steps to initialize the m_state.
// In SYCL, the constructor of the functor is called on the CPU
- // and we get the clock seed here from the CPU. However, This seed is
+ // and we get the clock seed here from the CPU. However, this seed is
// the same for all threads, since unlike CUDA, the thread ID, block ID, etc. are not global functions
// and are only available inside the operator() function (which is called on the GPU).
- // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
- // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
- // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
- // similar to CUDA Therefore, the thread Id injection is not available at this stage.
- //However when the operator() is called the thread ID will be avilable. So inside the opeator,
- // we add the thrreadID, BlockId,... (which is equivalent of i)
- //to the seed and construct the unique m_state per thead similar to cuda.
+ // Thus, for CUDA, (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread,
+ // but for SYCL, ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, and each thread adds
+ // (global_thread_id * 6364136223846793005ULL) for itself only once, in order to complete the construction
+ // similarly to CUDA. Therefore, the thread id injection is not available at this stage.
+ // However, when operator() is called, the thread ID will be available. So inside the operator,
+ // we add the threadID, BlockId, ... (which is the equivalent of i)
+ // to the seed and construct a unique m_state per thread, just as in CUDA.
m_exec_once =false;
#endif
}
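In code form, the two-step seeding the comment describes looks roughly like this (the constants are the ones quoted above; the helper names are illustrative, not Eigen API):

  #include <cstdint>

  static const uint64_t kMultiplier = 6364136223846793005ULL;
  static const uint64_t kIncrement  = 0xda3e39cb94b95bdbULL;

  // Step 1, in the constructor on the host, where no thread id exists yet:
  uint64_t initial_state(uint64_t clock_seed) {
    return clock_seed * kMultiplier + kIncrement;
  }

  // Step 2, inside operator() on the device, applied once per thread:
  uint64_t per_thread_state(uint64_t state, uint64_t global_thread_id) {
    return state + global_thread_id * kMultiplier;
  }

  int main() {
    uint64_t s = initial_state(/*clock_seed=*/123u);
    return static_cast<int>(per_thread_state(s, /*global_thread_id=*/7u) & 1u);
  }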
@@ -282,16 +279,16 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
#ifdef EIGEN_USE_SYCL
- // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
// Therefore, we need two steps to initialize the m_state.
// In SYCL, the constructor of the functor is called on the CPU
- // and we get the clock seed here from the CPU. However, This seed is
+ // and we get the clock seed here from the CPU. However, this seed is
// the same for all threads, since unlike CUDA, the thread ID, block ID, etc. are not global functions
// and are only available inside the operator() function (which is called on the GPU).
- // Therefore, the thread Id injection is not available at this stage. However when the operator()
- //is called the thread ID will be avilable. So inside the opeator,
- // we add the thrreadID, BlockId,... (which is equivalent of i)
- //to the seed and construct the unique m_state per thead similar to cuda.
+ // Therefore, the thread id injection is not available at this stage. However, when operator()
+ // is called, the thread ID will be available. So inside the operator,
+ // we add the threadID, BlockId, ... (which is the equivalent of i)
+ // to the seed and construct a unique m_state per thread, just as in CUDA.
m_exec_once =false;
#endif
}
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 4740d58..b9d4c55 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -11,9 +11,10 @@
#include <Eigen/CXX11/Tensor>
+template<typename Scalar>
static void test_default()
{
- Tensor<float, 1> vec(6);
+ Tensor<Scalar, 1> vec(6);
vec.setRandom();
// Fixme: we should check that the generated numbers follow a uniform
@@ -23,10 +24,11 @@
}
}
+template<typename Scalar>
static void test_normal()
{
- Tensor<float, 1> vec(6);
- vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+ Tensor<Scalar, 1> vec(6);
+ vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
// Fixme: we should check that the generated numbers follow a gaussian
// distribution instead.
@@ -72,7 +74,13 @@
EIGEN_DECLARE_TEST(cxx11_tensor_random)
{
- CALL_SUBTEST(test_default());
- CALL_SUBTEST(test_normal());
+ CALL_SUBTEST((test_default<float>()));
+ CALL_SUBTEST((test_normal<float>()));
+ CALL_SUBTEST((test_default<double>()));
+ CALL_SUBTEST((test_normal<double>()));
+ CALL_SUBTEST((test_default<Eigen::half>()));
+ CALL_SUBTEST((test_normal<Eigen::half>()));
+ CALL_SUBTEST((test_default<Eigen::bfloat16>()));
+ CALL_SUBTEST((test_normal<Eigen::bfloat16>()));
CALL_SUBTEST(test_custom());
}
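One detail in the templated rewrite above: the call became vec.template setRandom<...>() because vec now has a type that depends on Scalar, making setRandom a dependent name; the template keyword is required for the following < to parse as a template-argument list rather than less-than. A standalone illustration:

  template <typename T>
  struct Holder {
    template <typename Gen>
    void fill() {}
  };

  template <typename Scalar>
  void use(Holder<Scalar>& h) {
    h.template fill<int>();  // without 'template', '<' parses as less-than
  }

  int main() {
    Holder<float> h;
    use(h);
    return 0;
  }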
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index b5d656f..cbb799a 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -22,6 +22,9 @@
#endif
#define EIGEN_NO_DEPRECATED_WARNING
+// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix)
+// has an extra copy-assignment.
+#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
#include "sparse_product.cpp"
#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled