Update Eigen to commit:34780d8bd13d0af0cf17a22789ef286e8512594d
CHANGELOG
=========
34780d8bd - Include immintrin.h header for emscripten.
2cf4d18c9 - Disable AVX512 GEMM kernels by default.
a678a3e05 - Fix aligned_realloc to call check_that_malloc_is_allowed() if ptr == 0
4a5635940 - Add option to disable avx512 GEBP kernels
1092574b2 - Fix wrong doxygen group usage
e1165dbf9 - AutoDiff depends on Core, so include appropriate header.
bb51d9f4f - Fix ODR violations.
06a458a13 - Enable subtests which use device side malloc since this has been fixed in ROCm 5.2.
84cf3ff18 - Add pload_partial, pstore_partial (and unaligned versions), pgather_partial, pscatter_partial, loadPacketPartial and storePacketPartial.
c603275dc - Better performance for Power10 using more load and store vector pairs for GEMV
PiperOrigin-RevId: 464973344
Change-Id: Ida84f32c0ffb7c5cfddb2ca55ee5fbfb5267f729
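
The main feature in this update is the new family of partial packet operations (pload_partial, ploadu_partial, pstore_partial, pstoreu_partial, pgather_partial, pscatter_partial) and the matching loadPacketPartial/storePacketPartial accessors on the BLAS data mappers. As a rough sketch of the generic behaviour (illustrative only, not part of this diff; it uses Eigen's internal namespace and assumes this updated Eigen is on the include path), copying a short tail of a float array without reading or writing past its end looks like:

    #include <algorithm>
    #include <Eigen/Core>

    // Hypothetical helper: copy up to 3 floats through a single (possibly wider) packet.
    void copy_tail3(const float* src, float* dst) {
      using namespace Eigen;
      using namespace Eigen::internal;
      typedef packet_traits<float>::type P;                  // widest float packet for this build
      const Index n = (std::min<Index>)(unpacket_traits<P>::size, 3);
      P p = ploadu_partial<P>(src, n);                       // reads only src[0..n-1], remaining lanes zero-filled
      pstoreu_partial(dst, p, n);                            // writes only dst[0..n-1]
    }

Lanes outside the requested window are zero-filled on load and simply not written on store, which is the kind of edge handling the AltiVec/Power kernels in this diff rely on instead of scalar cleanup loops.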
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 3ea6855..8119200 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -606,14 +606,46 @@
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
-/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
+/** \internal \returns a packet version of \a *from, from must be properly aligned */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
+/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned
+ * offset indicates the starting element in which to load and
+ * offset + n <= unpacket_traits::size
+ * All elements before offset and after the last element loaded will be initialized with zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+ for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
+ elements[i] = from[i-offset];
+ }
+ return pload<Packet>(elements);
+}
+
/** \internal \returns a packet version of \a *from, (un-aligned load) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
+/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
+ * All elements after the last element loaded will be initialized with zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+ for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+ elements[i] = from[i];
+ }
+ return pload<Packet>(elements);
+}
+
/** \internal \returns a packet version of \a *from, (un-aligned masked load)
* There is no generic implementation. We only have implementations for specialized
* cases. Generic case should not be called.
@@ -704,14 +736,40 @@
}
-/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
+/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
{ (*to) = from; }
+/** \internal copy n elements of the packet \a from to \a *to, \a to must be properly aligned
+ * offset indicates the starting element in which to store and
+ * offset + n <= unpacket_traits::size */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+ EIGEN_ALIGN_MAX Scalar elements[packet_size];
+ pstore<Scalar>(elements, from);
+ for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
+ to[i] = elements[i + offset];
+ }
+}
+
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
{ (*to) = from; }
+/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+ EIGEN_ALIGN_MAX Scalar elements[packet_size];
+ pstore<Scalar>(elements, from);
+ for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+ to[i] = elements[i];
+ }
+}
+
/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
* There is no generic implementation. We only have implementations for specialized
* cases. Generic case should not be called.
@@ -721,11 +779,31 @@
std::enable_if_t<unpacket_traits<Packet>::masked_store_available, void>
pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
- { return ploadu<Packet>(from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
+{ return ploadu<Packet>(from); }
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
- { pstore(to, from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+ for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+ elements[i] = from[i*stride];
+ }
+ return pload<Packet>(elements);
+}
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
+{ pstore(to, from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX Scalar elements[packet_size];
+ pstore<Scalar>(elements, from);
+ for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+ to[i*stride] = elements[i];
+ }
+}
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
@@ -996,6 +1074,17 @@
return ploadu<Packet>(from);
}
+/** \internal \returns n elements of a packet version of \a *from.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
+{
+ if(Alignment >= unpacket_traits<Packet>::alignment)
+ return pload_partial<Packet>(from, n, offset);
+ else
+ return ploadu_partial<Packet>(from, n);
+}
+
/** \internal copy the packet \a from to \a *to.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
template<typename Scalar, typename Packet, int Alignment>
@@ -1007,6 +1096,17 @@
pstoreu(to, from);
}
+/** \internal copy n elements of the packet \a from to \a *to.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
+{
+ if(Alignment >= unpacket_traits<Packet>::alignment)
+ pstore_partial(to, from, n, offset);
+ else
+ pstoreu_partial(to, from, n);
+}
+
/** \internal \returns a packet version of \a *from.
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
* hardware if available to speedup the loading of data that won't be modified
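
For the offset parameter documented above: pload_partial fills packet lanes [offset, offset+n) from from[0..n-1] and zeroes the remaining lanes, while pstore_partial writes lanes [offset, offset+n) back to to[0..n-1]. A minimal sketch, assuming the build's float packet holds at least 4 lanes (illustrative only, not part of this diff):

    #include <Eigen/Core>

    void offset_demo() {
      using namespace Eigen;
      using namespace Eigen::internal;
      typedef packet_traits<float>::type P;
      if (unpacket_traits<P>::size < 4) return;          // skip on scalar or narrow-packet builds
      EIGEN_ALIGN_MAX float src[4] = {1.f, 2.f, 0.f, 0.f};
      EIGEN_ALIGN_MAX float dst[4] = {0.f, 0.f, 0.f, 0.f};
      // Lanes become [0, src[0], src[1], 0, ...]: lanes outside [offset, offset+n) stay zero.
      P p = pload_partial<P>(src, /*n=*/2, /*offset=*/1);
      // Copies lanes 1..2 to dst[0..1]; dst[2..3] are left untouched.
      pstore_partial(dst, p, /*n=*/2, /*offset=*/1);
    }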
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index e0bde54..222eaf5 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -638,7 +638,7 @@
*
* \see class Map
*/
- //@{
+ ///@{
static inline ConstMapType Map(const Scalar* data)
{ return ConstMapType(data); }
static inline MapType Map(Scalar* data)
@@ -702,7 +702,7 @@
template<int Outer, int Inner>
static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
{ return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
- //@}
+ ///@}
using Base::setConstant;
EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index 477c50f..cb7cfdf 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef GEMM_KERNEL_H
-#define GEMM_KERNEL_H
+#ifndef EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
#if EIGEN_COMP_MSVC
#include <intrin.h>
@@ -20,6 +20,11 @@
#include "../../InternalHeaderCheck.h"
+#if !defined(EIGEN_USE_AVX512_GEMM_KERNELS)
+// Disable new AVX512 kernels by default.
+#define EIGEN_USE_AVX512_GEMM_KERNELS 0
+#endif
+
#define SECOND_FETCH (32)
#if (EIGEN_COMP_GNUC_STRICT != 0) && !defined(EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS)
// Use less registers to load A elements to workaround compiler spills. Loose a
@@ -930,6 +935,8 @@
g.template compute_kern<max_a_unroll, max_b_unroll>();
}
+// Template specializations of GEBP kernels with nr = 8.
+#if EIGEN_USE_AVX512_GEMM_KERNELS
template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
: public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
@@ -1218,8 +1225,11 @@
}
}
}
+#endif // EIGEN_USE_AVX512_GEMM_KERNELS
} // namespace internal
} // namespace Eigen
-#endif // GEMM_KERNEL_H
+#undef SECOND_FETCH
+
+#endif // EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
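
The new nr = 8 AVX512 GEBP specializations are now compiled out unless the user opts in. A sketch of how a project could opt back in (assuming an AVX512-capable target and compiler flags; not part of this diff):

    // Must be defined before the first Eigen include so the guard above sees it.
    #define EIGEN_USE_AVX512_GEMM_KERNELS 1
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXf a = Eigen::MatrixXf::Random(256, 256);
      Eigen::MatrixXf b = Eigen::MatrixXf::Random(256, 256);
      Eigen::MatrixXf c = a * b;   // dispatches to the AVX512 GEBP kernels when built with AVX512 enabled
      return c.rows() == 256 ? 0 : 1;
    }

The same macro can also be set from the build system (e.g. -DEIGEN_USE_AVX512_GEMM_KERNELS=1).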
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index 1b351ea..94f1f5a 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_TRSM_KERNEL_IMPL_H
-#define EIGEN_TRSM_KERNEL_IMPL_H
+#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
#include "../../InternalHeaderCheck.h"
@@ -106,6 +106,10 @@
int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
}
+#else // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
#endif
/**
@@ -1184,4 +1188,4 @@
#endif // EIGEN_USE_AVX512_TRSM_KERNELS
} // namespace internal
} // namespace Eigen
-#endif // EIGEN_TRSM_KERNEL_IMPL_H
+#endif // EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
diff --git a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
index 032937c..6b09424 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
+++ b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_UNROLLS_IMPL_H
-#define EIGEN_UNROLLS_IMPL_H
+#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
+#define EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
template <bool isARowMajor = true>
static EIGEN_ALWAYS_INLINE int64_t idA(int64_t i, int64_t j, int64_t LDA) {
@@ -1209,4 +1209,4 @@
};
} // namespace unrolls
-#endif // EIGEN_UNROLLS_IMPL_H
+#endif // EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index ba5a3fd..6046035 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -132,10 +132,20 @@
template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
+{
+ return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
+}
+template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
+{
+ return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2));
+}
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); }
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
{
@@ -157,19 +167,46 @@
return Packet2cf(res0);
}
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from)
{
- EIGEN_ALIGN16 std::complex<float> af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet2cf>(af);
+ Packet2cf res;
+ res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
+ return res;
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
{
- EIGEN_ALIGN16 std::complex<float> af[2];
- pstore<std::complex<float> >((std::complex<float> *) af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
+ eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+ EIGEN_ALIGN16 Scalar af[2];
+ for (Index i = 0; i < n; i++) {
+ af[i] = from[i*stride];
+ }
+ return pload_ignore<Packet>(af);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+ return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n)
+{
+ return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n);
+}
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
+{
+ eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
+ EIGEN_ALIGN16 Scalar af[2];
+ pstore<Scalar>((Scalar *) af, from);
+ for (Index i = 0; i < n; i++) {
+ to[i*stride] = af[i];
+ }
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+ pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride, const Index n)
+{
+ pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n);
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
@@ -336,17 +373,35 @@
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
+{
+ return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
+}
+template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n)
+{
+ return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2));
+}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
+template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
{
return pload<Packet1cd>(from);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index)
+{
+ return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
+{
+ pstore<std::complex<double> >(to, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index, const Index)
{
pstore<std::complex<double> >(to, from);
}
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 940a817..9d00b93 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -207,12 +207,8 @@
} \
}
-#if EIGEN_COMP_LLVM
-#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2)
-#else
#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize));
-#endif
#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
if (GEMV_GETN(N) > iter1) { \
@@ -231,8 +227,9 @@
#if EIGEN_COMP_LLVM
#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
ResPacket f##iter2[2]; \
- f##iter2[0] = pmadd(result##iter2.packet[0], palpha, ploadu<ResPacket>(res + i + ((iter4) * ResPacketSize))); \
- f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, ploadu<ResPacket>(res + i + (((iter4) + 1) * ResPacketSize))); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
+ f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
+ f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
#else
#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
@@ -932,7 +929,7 @@
{
PResPacket c2 = pcplxflipconj(c0);
PResPacket c3 = pcplxflipconj(c1);
-#if EIGEN_COMP_LLVM || !defined(_ARCH_PWR10)
+#if !defined(_ARCH_PWR10)
ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
@@ -941,6 +938,13 @@
pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
#else
__vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize));
+#if EIGEN_COMP_LLVM
+ PResPacket c6[2];
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
+ c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
+ c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
+ GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
+#else
if (GEMV_IS_COMPLEX_FLOAT) {
__asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
__asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
@@ -948,6 +952,7 @@
__asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
__asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
}
+#endif
*reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a;
#endif
}
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 91b3e20..4dd53f6 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -143,6 +143,12 @@
#define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#endif
+#if EIGEN_COMP_LLVM
+#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
+#else
+#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
+#endif
+
template <>
struct packet_traits<float> : default_packet_traits {
typedef Packet4f type;
@@ -472,6 +478,118 @@
}
template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from)
+{
+ // some versions of GCC throw "unused-but-set-parameter".
+ // ignoring these warnings for now.
+ EIGEN_UNUSED_VARIABLE(from);
+ EIGEN_DEBUG_ALIGNED_LOAD
+ // Ignore warnings about partially initialized input memory
+#if !EIGEN_COMP_LLVM
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __VSX__
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+ return vec_ld(0, from);
+#endif
+#if !EIGEN_COMP_LLVM
+ #pragma GCC diagnostic pop
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from)
+{
+ return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
+{
+ // some versions of GCC throw "unused-but-set-parameter".
+ // ignoring these warnings for now.
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+ EIGEN_DEBUG_ALIGNED_LOAD
+ EIGEN_UNUSED_VARIABLE(from);
+ Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+ if (offset) {
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+ load = Packet(vec_sro(Packet16uc(load), shift));
+#else
+ load = Packet(vec_slo(Packet16uc(load), shift));
+#endif
+ }
+ return load;
+#else
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+ unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
+ unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+ Index n2 = n * size;
+ Index i = 0;
+ if (16 <= n2) {
+ pstoreu(load2, ploadu<Packet16uc>(from2));
+ i += 16;
+ }
+ if (i + 8 <= n2) {
+ *reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
+ i += 8;
+ }
+ if (i + 4 <= n2) {
+ *reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
+ i += 4;
+ }
+ if (i + 2 <= n2) {
+ *reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
+ i += 2;
+ }
+ if (i < n2) {
+ *reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
+ }
+ return pload_ignore<Packet>(load);
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet4f>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet4i>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet8s>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet8us>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet16c>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet16uc>(from, n, offset);
+}
+
+template <typename Packet>
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
// some versions of GCC throw "unused-but-set-parameter" (float *to).
// ignoring these warnings for now.
@@ -519,6 +637,90 @@
pstore_common<Packet16uc>(to, from);
}
+template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
+{
+ // some versions of GCC throw "unused-but-set-parameter" (float *to).
+ // ignoring these warnings for now.
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+ EIGEN_UNUSED_VARIABLE(to);
+ EIGEN_DEBUG_ALIGNED_STORE
+ Packet store = from;
+ if (offset) {
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+ store = Packet(vec_slo(Packet16uc(store), shift));
+#else
+ store = Packet(vec_sro(Packet16uc(store), shift));
+#endif
+ }
+ vec_xst_len(store, to, n * size);
+#else
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
+ pstore(store, from);
+ unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
+ unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+ Index n2 = n * size;
+ Index i = 0;
+ if (16 <= n2) {
+ pstore(to2, ploadu<Packet16uc>(store2));
+ i += 16;
+ }
+ if (i + 8 <= n2) {
+ *reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
+ i += 8;
+ }
+ if (i + 4 <= n2) {
+ *reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
+ i += 4;
+ }
+ if (i + 2 <= n2) {
+ *reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
+ i += 2;
+ }
+ if (i < n2) {
+ *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
+ }
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet4f>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet4i>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet8s>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet8us>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet16c>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet16uc>(to, from, n, offset);
+}
+
template<typename Packet>
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
{
@@ -596,168 +798,167 @@
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
}
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
{
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
- a[0] = from[0*stride];
- a[1] = from[1*stride];
- a[2] = from[2*stride];
- a[3] = from[3*stride];
- return pload<Packet>(a);
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
+ eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+ LOAD_STORE_UNROLL_16
+ for (Index i = 0; i < n; i++) {
+ a[i] = from[i*stride];
+ }
+ // Leave rest of the array uninitialized
+ return pload_ignore<Packet>(a);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
return pgather_common<Packet4f>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
return pgather_common<Packet4i>(from, stride);
}
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
{
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
- a[0] = from[0*stride];
- a[1] = from[1*stride];
- a[2] = from[2*stride];
- a[3] = from[3*stride];
- a[4] = from[4*stride];
- a[5] = from[5*stride];
- a[6] = from[6*stride];
- a[7] = from[7*stride];
- return pload<Packet>(a);
+ return pgather_common<Packet8s>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
{
- return pgather_size8<Packet8s>(from, stride);
+ return pgather_common<Packet8us>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
{
- return pgather_size8<Packet8us>(from, stride);
+ return pgather_common<Packet8bf>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
{
- return pgather_size8<Packet8bf>(from, stride);
+ return pgather_common<Packet16c>(from, stride);
}
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
{
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
- a[0] = from[0*stride];
- a[1] = from[1*stride];
- a[2] = from[2*stride];
- a[3] = from[3*stride];
- a[4] = from[4*stride];
- a[5] = from[5*stride];
- a[6] = from[6*stride];
- a[7] = from[7*stride];
- a[8] = from[8*stride];
- a[9] = from[9*stride];
- a[10] = from[10*stride];
- a[11] = from[11*stride];
- a[12] = from[12*stride];
- a[13] = from[13*stride];
- a[14] = from[14*stride];
- a[15] = from[15*stride];
- return pload<Packet>(a);
+ return pgather_common<Packet16uc>(from, stride);
}
-
-template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride, const Index n)
{
- return pgather_size16<Packet16c>(from, stride);
+ return pgather_common<Packet4f>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride, const Index n)
{
- return pgather_size16<Packet16uc>(from, stride);
+ return pgather_common<Packet4i>(from, stride, n);
}
-template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride, const Index n)
{
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+ return pgather_common<Packet8s>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n)
+{
+ return pgather_common<Packet8us>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride, const Index n)
+{
+ return pgather_common<Packet8bf>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from, Index stride, const Index n)
+{
+ return pgather_common<Packet16c>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from, Index stride, const Index n)
+{
+ return pgather_common<Packet16uc>(from, stride, n);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
+{
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
+ eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
pstore<__UNPACK_TYPE__(Packet)>(a, from);
- to[0*stride] = a[0];
- to[1*stride] = a[1];
- to[2*stride] = a[2];
- to[3*stride] = a[3];
+ LOAD_STORE_UNROLL_16
+ for (Index i = 0; i < n; i++) {
+ to[i*stride] = a[i];
+ }
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
- pscatter_size4<Packet4f>(to, from, stride);
+ pscatter_common<Packet4f>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
- pscatter_size4<Packet4i>(to, from, stride);
+ pscatter_common<Packet4i>(to, from, stride);
}
-template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
{
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
- pstore<__UNPACK_TYPE__(Packet)>(a, from);
- to[0*stride] = a[0];
- to[1*stride] = a[1];
- to[2*stride] = a[2];
- to[3*stride] = a[3];
- to[4*stride] = a[4];
- to[5*stride] = a[5];
- to[6*stride] = a[6];
- to[7*stride] = a[7];
+ pscatter_common<Packet8s>(to, from, stride);
}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
{
- pscatter_size8<Packet8s>(to, from, stride);
+ pscatter_common<Packet8us>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
{
- pscatter_size8<Packet8us>(to, from, stride);
+ pscatter_common<Packet8bf>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
{
- pscatter_size8<Packet8bf>(to, from, stride);
+ pscatter_common<Packet16c>(to, from, stride);
}
-template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
{
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
- pstore<__UNPACK_TYPE__(Packet)>(a, from);
- to[0*stride] = a[0];
- to[1*stride] = a[1];
- to[2*stride] = a[2];
- to[3*stride] = a[3];
- to[4*stride] = a[4];
- to[5*stride] = a[5];
- to[6*stride] = a[6];
- to[7*stride] = a[7];
- to[8*stride] = a[8];
- to[9*stride] = a[9];
- to[10*stride] = a[10];
- to[11*stride] = a[11];
- to[12*stride] = a[12];
- to[13*stride] = a[13];
- to[14*stride] = a[14];
- to[15*stride] = a[15];
+ pscatter_common<Packet16uc>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
{
- pscatter_size16<Packet16c>(to, from, stride);
+ pscatter_common<Packet4f>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
{
- pscatter_size16<Packet16uc>(to, from, stride);
+ pscatter_common<Packet4i>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
+{
+ pscatter_common<Packet8s>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
+{
+ pscatter_common<Packet8us>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride, const Index n)
+{
+ pscatter_common<Packet8bf>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
+{
+ pscatter_common<Packet16c>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
+{
+ pscatter_common<Packet16uc>(to, from, stride, n);
}
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
@@ -1008,6 +1209,73 @@
return ploadu_common<Packet16uc>(from);
}
+template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+ EIGEN_DEBUG_ALIGNED_LOAD
+ EIGEN_DEBUG_UNALIGNED_LOAD
+ return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+#else
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+ unsigned char* load2 = reinterpret_cast<unsigned char *>(load);
+ unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+ Index n2 = n * size;
+ Index i = 0;
+ if (16 <= n2) {
+ pstore(load2, ploadu<Packet16uc>(from2));
+ i += 16;
+ }
+ if (i + 8 <= n2) {
+ *reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
+ i += 8;
+ }
+ if (i + 4 <= n2) {
+ *reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
+ i += 4;
+ }
+ if (i + 2 <= n2) {
+ *reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
+ i += 2;
+ }
+ if (i < n2) {
+ *reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
+ }
+ return pload_ignore<Packet>(load);
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n)
+{
+ return ploadu_partial_common<Packet4f>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n)
+{
+ return ploadu_partial_common<Packet4i>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n)
+{
+ return ploadu_partial_common<Packet8s>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n)
+{
+ return ploadu_partial_common<Packet8us>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n)
+{
+ return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n)
+{
+ return ploadu_partial_common<Packet16c>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n)
+{
+ return ploadu_partial_common<Packet16uc>(from, n);
+}
+
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
{
Packet p;
@@ -1128,6 +1396,77 @@
pstoreu_common<Packet16uc>(to, from);
}
+template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n)
+{
+ const Index packet_size = unpacket_traits<Packet>::size;
+ eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+ EIGEN_DEBUG_UNALIGNED_STORE
+ vec_xst_len(from, to, n * size);
+#else
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
+ pstore(store, from);
+ unsigned char* store2 = reinterpret_cast<unsigned char *>(store);
+ unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+ Index n2 = n * size;
+ Index i = 0;
+ if (16 <= n2) {
+ pstoreu(to2, pload<Packet16uc>(store2));
+ i += 16;
+ }
+ if (i + 8 <= n2) {
+ *reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
+ i += 8;
+ }
+ if (i + 4 <= n2) {
+ *reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
+ i += 4;
+ }
+ if (i + 2 <= n2) {
+ *reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
+ i += 2;
+ }
+ if (i < n2) {
+ *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
+ }
+
+ LOAD_STORE_UNROLL_16
+ for (Index i = 0; i < n; i++) {
+ to[i] = from[i];
+ }
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n)
+{
+ pstoreu_partial_common<Packet4f>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n)
+{
+ pstoreu_partial_common<Packet4i>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n)
+{
+ pstoreu_partial_common<Packet8s>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n)
+{
+ pstoreu_partial_common<Packet8us>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n)
+{
+ pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n)
+{
+ pstoreu_partial_common<Packet16c>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n)
+{
+ pstoreu_partial_common<Packet16uc>(to, from, n);
+}
+
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
@@ -2387,12 +2726,22 @@
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
}
+template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
+{
+ return pload_partial_common<Packet2d>(from, n, offset);
+}
+
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
EIGEN_DEBUG_ALIGNED_STORE
vec_xst(from, 0, to);
}
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
+{
+ pstore_partial_common<Packet2d>(to, from, n, offset);
+}
+
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
Packet2d v = {from, from};
return v;
@@ -2414,19 +2763,21 @@
a3 = pset1<Packet2d>(a[3]);
}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
- EIGEN_ALIGN16 double af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet2d>(af);
+ return pgather_common<Packet2d>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
{
- EIGEN_ALIGN16 double af[2];
- pstore<double>(af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
+ return pgather_common<Packet2d>(from, stride, n);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+ pscatter_common<Packet2d>(to, from, stride);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
+{
+ pscatter_common<Packet2d>(to, from, stride, n);
}
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
@@ -2517,6 +2868,11 @@
return vec_xl(0, const_cast<double*>(from));
}
+template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n)
+{
+ return ploadu_partial_common<Packet2d>(from, n);
+}
+
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
Packet2d p;
@@ -2531,6 +2887,11 @@
vec_xst(from, 0, to);
}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n)
+{
+ pstoreu_partial_common<Packet2d>(to, from, n);
+}
+
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
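
On pre-Power9 targets the partial load/store helpers above fall back to copying exactly n * sizeof(Scalar) bytes through an aligned staging buffer, in descending 16/8/4/2/1-byte chunks, so nothing outside the requested window is ever touched. A standalone illustration of that chunking scheme (hypothetical helper, not Eigen code; it uses memcpy instead of the reinterpret_cast stores to sidestep aliasing questions):

    #include <cstddef>
    #include <cstring>

    // Copy exactly n2 bytes (at most the 16-byte packet width in the Eigen use case)
    // without reading or writing beyond them.
    void copy_bytes_chunked(unsigned char* dst, const unsigned char* src, std::size_t n2) {
      std::size_t i = 0;
      if (16 <= n2)    { std::memcpy(dst, src, 16);        i += 16; }  // one full 16-byte vector
      if (i + 8 <= n2) { std::memcpy(dst + i, src + i, 8); i += 8;  }  // remaining double word
      if (i + 4 <= n2) { std::memcpy(dst + i, src + i, 4); i += 4;  }  // word
      if (i + 2 <= n2) { std::memcpy(dst + i, src + i, 2); i += 2;  }  // half word
      if (i < n2)      { dst[i] = src[i]; }                            // final byte
    }

On Power9 and later the same operations map directly onto vec_xl_len / vec_xst_len, with a byte shift (vec_slo / vec_sro) to honour the packet-lane offset, as in the _ARCH_PWR9 branches above.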
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index e2eef19..56473a9 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -100,6 +100,11 @@
return ploadt<PacketType, AlignmentType>(m_data + i);
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const {
+ return ploadt_partial<PacketType, AlignmentType>(m_data + i, n, offset);
+ }
+
template<typename PacketType, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const {
return ploadt<PacketType, AlignmentT>(m_data + i);
@@ -110,6 +115,11 @@
pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index offset = 0) const {
+ pstoret_partial<Scalar, PacketType, AlignmentType>(m_data + i, p, n, offset);
+ }
+
protected:
Scalar *m_data;
};
@@ -208,6 +218,11 @@
return ploadt<PacketType, AlignmentType>(&operator()(i, j));
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index offset = 0) const {
+ return ploadt_partial<PacketType, AlignmentType>(&operator()(i, j), n, offset);
+ }
+
template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return ploadt<PacketT, AlignmentT>(&operator()(i, j));
@@ -218,6 +233,11 @@
pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index offset = 0) const {
+ pstoret_partial<Scalar, PacketType, AlignmentType>(&operator()(i, j), p, n, offset);
+ }
+
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@@ -272,10 +292,20 @@
}
template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const {
+ return pgather_partial<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value(), n);
+ }
+
+ template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/) const {
+ pscatter_partial<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value(), n);
+ }
+
protected:
Scalar *m_data;
const internal::variable_if_dynamic<Index,Incr> m_incr;
@@ -312,6 +342,11 @@
return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/) const {
+ return pgather_partial<Scalar,PacketType>(&operator()(i, j),m_incr.value(),n);
+ }
+
template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
@@ -322,6 +357,11 @@
pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
}
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/) const {
+ pscatter_partial<Scalar, PacketType>(&operator()(i, j), p, m_incr.value(), n);
+ }
+
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
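
These mapper-level accessors let a kernel run its main loop in full packets and finish the remainder with a single partial access. A sketch of the pattern against a linear result mapper (illustrative only; the helper below is hypothetical and not taken from this diff):

    #include <Eigen/Core>

    // Add 'value' to the first 'rows' entries addressed by a blas_data_mapper-style linear mapper.
    template <typename LinearMapper, typename Packet>
    void add_value_to_column(const LinearMapper& res, const Packet& value, Eigen::Index rows) {
      const Eigen::Index ps = Eigen::internal::unpacket_traits<Packet>::size;
      Eigen::Index i = 0;
      for (; i + ps <= rows; i += ps) {
        res.storePacket(i, Eigen::internal::padd(res.template loadPacket<Packet>(i), value));
      }
      if (i < rows) {  // tail of fewer than ps elements, no scalar cleanup loop needed
        Packet tail = res.template loadPacketPartial<Packet>(i, rows - i);
        res.storePacketPartial(i, Eigen::internal::padd(tail, value), rows - i);
      }
    }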
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 4ba1370..e473215 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -317,7 +317,7 @@
extern "C" {
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
// Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
- #if EIGEN_COMP_ICC >= 1110
+ #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
#include <immintrin.h>
#else
#include <mmintrin.h>
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 7657ead..9823fa3 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -216,6 +216,7 @@
*/
inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
{
+ if (ptr == 0) return aligned_malloc(new_size);
EIGEN_UNUSED_VARIABLE(old_size)
void *result;
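
With this change Eigen::internal::aligned_realloc follows the std::realloc convention that a null input pointer degenerates to a fresh allocation, and that allocation goes through aligned_malloc and therefore through the check_that_malloc_is_allowed() guard. A minimal sketch of the contract (hypothetical wrapper, illustrative only):

    #include <cstddef>
    #include <Eigen/Core>

    // Grow a buffer that may not have been allocated yet.
    void* grow_buffer(void* p, std::size_t new_size, std::size_t old_size) {
      // p == 0 is now valid: it behaves like aligned_malloc(new_size) instead of
      // reallocating an address that was never obtained from aligned_malloc.
      return Eigen::internal::aligned_realloc(p, new_size, old_size);
    }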
diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h
index 74c6561..3ab0c72 100644
--- a/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -72,8 +72,8 @@
namespace Eigen {
namespace internal {
-
-typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType;
+
+enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL};
template <typename IndexVector, typename ScalarVector>
struct LU_GlobalLU_t {
diff --git a/test/gpu_example.cu b/test/gpu_example.cu
index 9bc34e5..a69f5ea 100644
--- a/test/gpu_example.cu
+++ b/test/gpu_example.cu
@@ -21,7 +21,7 @@
EIGEN_DEVICE_FUNC
Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
C = A + B; // Populate output parameter.
- Type3 D = A + B; // Populate return value.
+ Type3 D = A + B; // Populate return value.
return D;
}
};
@@ -31,7 +31,7 @@
void test_add(const T& type) {
const Index rows = type.rows();
const Index cols = type.cols();
-
+
// Create random inputs.
const T A = T::Random(rows, cols);
const T B = T::Random(rows, cols);
@@ -39,17 +39,17 @@
// Create kernel.
AddKernel add_kernel;
-
+
// Run add_kernel(A, B, C) via run(...).
// This will run on the GPU if using a GPU compiler, or CPU otherwise,
// facilitating generic tests that can run on either.
T D = run(add_kernel, A, B, C);
-
+
// Check that both output parameter and return value are correctly populated.
const T expected = A + B;
VERIFY_IS_CWISE_EQUAL(C, expected);
VERIFY_IS_CWISE_EQUAL(D, expected);
-
+
// In a GPU-only test, we can verify that the CPU and GPU produce the
// same results.
T C_cpu, C_gpu;
@@ -70,31 +70,30 @@
template <typename T1, typename T2, typename T3>
void test_multiply(const T1& type1, const T2& type2, const T3& type3) {
-
const T1 A = T1::Random(type1.rows(), type1.cols());
const T2 B = T2::Random(type2.rows(), type2.cols());
T3 C;
MultiplyKernel multiply_kernel;
-
+
// The run(...) family of functions uses a memory buffer to transfer data back
// and forth to and from the device. The size of this buffer is estimated
// from the size of all input parameters. If the estimated buffer size is
// not sufficient for transferring outputs from device-to-host, then an
// explicit buffer size needs to be specified.
-
+
// 2 outputs of size (A * B). For each matrix output, the buffer will store
// the number of rows, columns, and the data.
size_t buffer_capacity_hint = 2 * ( // 2 output parameters
2 * sizeof(typename T3::Index) // # Rows, # Cols
+ A.rows() * B.cols() * sizeof(typename T3::Scalar)); // Output data
-
+
T3 D = run_with_hint(buffer_capacity_hint, multiply_kernel, A, B, C);
-
+
const T3 expected = A * B;
VERIFY_IS_CWISE_APPROX(C, expected);
VERIFY_IS_CWISE_APPROX(D, expected);
-
+
T3 C_cpu, C_gpu;
T3 D_cpu = run_on_cpu(multiply_kernel, A, B, C_cpu);
T3 D_gpu = run_on_gpu_with_hint(buffer_capacity_hint,
@@ -107,30 +106,24 @@
EIGEN_DECLARE_TEST(gpu_example)
{
// For the number of repeats, call the desired subtests.
- for(int i = 0; i < g_repeat; i++) {
+ for(int i = 0; i < g_repeat; i++) {
// Call subtests with different sized/typed inputs.
CALL_SUBTEST( test_add(Eigen::Vector3f()) );
CALL_SUBTEST( test_add(Eigen::Matrix3d()) );
-#if !defined(EIGEN_USE_HIP) // FIXME
CALL_SUBTEST( test_add(Eigen::MatrixX<int>(10, 10)) );
-#endif
CALL_SUBTEST( test_add(Eigen::Array44f()) );
-#if !defined(EIGEN_USE_HIP)
CALL_SUBTEST( test_add(Eigen::ArrayXd(20)) );
CALL_SUBTEST( test_add(Eigen::ArrayXXi(13, 17)) );
-#endif
CALL_SUBTEST( test_multiply(Eigen::Matrix3d(),
Eigen::Matrix3d(),
Eigen::Matrix3d()) );
-#if !defined(EIGEN_USE_HIP)
CALL_SUBTEST( test_multiply(Eigen::MatrixX<int>(10, 10),
Eigen::MatrixX<int>(10, 10),
Eigen::MatrixX<int>()) );
CALL_SUBTEST( test_multiply(Eigen::MatrixXf(12, 1),
Eigen::MatrixXf(1, 32),
Eigen::MatrixXf()) );
-#endif
}
}
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 2d8e708..163ef47 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -458,6 +458,36 @@
VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
}
+ for (int M = 0; M < PacketSize; ++M) {
+ for (int N = 0; N <= PacketSize; ++N) {
+ for (int j = 0; j < size; ++j) {
+ data1[j] = internal::random<Scalar>() / RealScalar(PacketSize);
+ data2[j] = internal::random<Scalar>() / RealScalar(PacketSize);
+ refvalue = (std::max)(refvalue, numext::abs(data1[j]));
+ }
+
+ if (M == 0) {
+ internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N), N);
+ VERIFY(test::areApprox(data1, data2, N) && "aligned loadN/storeN");
+
+ for (int offset = 0; offset < PacketSize; ++offset) {
+ internal::pstore_partial(data2, internal::ploadu_partial<Packet>(data1 + offset, N), N);
+ VERIFY(test::areApprox(data1 + offset, data2, N) && "internal::ploadu_partial");
+ }
+
+ for (int offset = 0; offset < PacketSize; ++offset) {
+ internal::pstoreu_partial(data2 + offset, internal::pload_partial<Packet>(data1, N), N);
+ VERIFY(test::areApprox(data1, data2 + offset, N) && "internal::pstoreu_partial");
+ }
+ }
+
+ if (N + M > PacketSize) continue; // Don't read or write past end of Packet
+
+ internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N, M), N, M);
+ VERIFY(test::areApprox(data1, data2, N) && "aligned offset loadN/storeN");
+ }
+ }
+
if (internal::unpacket_traits<Packet>::masked_load_available) {
test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
unsigned long long max_umask = (0x1ull << PacketSize);
@@ -1372,6 +1402,36 @@
for (int i = 0; i < PacketSize; ++i) {
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
}
+
+ for (Index N = 0; N <= PacketSize; ++N) {
+ for (Index i = 0; i < N; ++i) {
+ data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+ }
+
+ for (Index i = 0; i < N * 20; ++i) {
+ buffer[i] = Scalar(0);
+ }
+
+ packet = internal::pload_partial<Packet>(data1, N);
+ internal::pscatter_partial<Scalar, Packet>(buffer, packet, stride, N);
+
+ for (Index i = 0; i < N * 20; ++i) {
+ if ((i % stride) == 0 && i < stride * N) {
+ VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter_partial");
+ } else {
+ VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter_partial");
+ }
+ }
+
+ for (Index i = 0; i < N * 7; ++i) {
+ buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+ }
+ packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
+ internal::pstore_partial(data1, packet, N);
+ for (Index i = 0; i < N; ++i) {
+ VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather_partial");
+ }
+ }
}
namespace Eigen {
diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
index 1d1be3b..62fc0b3 100644
--- a/unsupported/Eigen/AutoDiff
+++ b/unsupported/Eigen/AutoDiff
@@ -10,6 +10,8 @@
#ifndef EIGEN_AUTODIFF_MODULE_H
#define EIGEN_AUTODIFF_MODULE_H
+#include "../../Eigen/Core"
+
namespace Eigen {
/**
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 295f6e5..8c705ec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -256,10 +256,10 @@
// the SAME case.
// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
// Pc=0.
-typedef enum {
+enum PaddingType {
PADDING_VALID = 1,
PADDING_SAME = 2
-} PaddingType;
+};
} // end namespace Eigen