Update Eigen to commit: 34780d8bd13d0af0cf17a22789ef286e8512594d

CHANGELOG
=========
34780d8bd - Include immintrin.h header for Emscripten.
2cf4d18c9 - Disable AVX512 GEMM kernels by default.
a678a3e05 - Fix aligned_realloc to call check_that_malloc_is_allowed() if ptr == 0
4a5635940 - Add option to disable avx512 GEBP kernels
1092574b2 - Fix wrong doxygen group usage
e1165dbf9 - AutoDiff depends on Core, so include appropriate header.
bb51d9f4f - Fix ODR violations.
06a458a13 - Enable subtests which use device side malloc since this has been fixed in ROCm 5.2.
84cf3ff18 - Add pload_partial, pstore_partial (and unaligned versions), pgather_partial, pscatter_partial, loadPacketPartial and storePacketPartial.
c603275dc - Better performance for Power10 using more load and store vector pairs for GEMV

PiperOrigin-RevId: 464973344
Change-Id: Ida84f32c0ffb7c5cfddb2ca55ee5fbfb5267f729
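The headline addition in this update is the partial packet API from commit 84cf3ff18: pload_partial/pstore_partial (plus their unaligned, gather, and scatter variants) load or store only n elements of a packet, zero-filling the unused lanes on load. This lets a vectorized loop finish a tail of fewer than packet-size elements without a scalar cleanup loop. A minimal usage sketch, not part of this patch, assuming a target that defines Packet4f (four float lanes):

    #include <Eigen/Core>
    using namespace Eigen::internal;

    // Hypothetical tail loop: full packets first, then one partial
    // load/store pair for the remaining 0 < n < 4 elements.
    void scale(float* data, Eigen::Index size, float factor) {
      const Eigen::Index packet_size = unpacket_traits<Packet4f>::size;
      const Packet4f pfactor = pset1<Packet4f>(factor);
      Eigen::Index i = 0;
      for (; i + packet_size <= size; i += packet_size)
        pstoreu(data + i, pmul(pfactor, ploadu<Packet4f>(data + i)));
      if (const Eigen::Index n = size - i) {
        Packet4f p = ploadu_partial<Packet4f>(data + i, n);
        pstoreu_partial(data + i, pmul(pfactor, p), n);
      }
    }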
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 3ea6855..8119200 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -606,14 +606,46 @@
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
 
-/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
+/** \internal \returns a packet version of \a *from, from must be properly aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned
+  * offset indicates the starting element at which to load and
+  * offset + n <= unpacket_traits::size
+  * All elements before offset and after the last element loaded will be initialized with zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
+    elements[i] = from[i-offset];
+  }
+  return pload<Packet>(elements);
+}
+
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
+  * All elements after the last element loaded will be initialized with zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    elements[i] = from[i];
+  }
+  return pload<Packet>(elements);
+}
+
 /** \internal \returns a packet version of \a *from, (un-aligned masked load)
  * There is no generic implementation. We only have implementations for specialized
  * cases. Generic case should not be called.
@@ -704,14 +736,40 @@
 }
 
 
-/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
+/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }
 
+/** \internal copy n elements of the packet \a from to \a *to, \a to must be properly aligned
+ * offset indicates the starting element at which to store and
+ * offset + n <= unpacket_traits::size */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
+    to[i] = elements[i + offset];
+  }
+}
+
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 {  (*to) = from; }
 
+/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    to[i] = elements[i];
+  }
+}
+
 /** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
  * There is no generic implementation. We only have implementations for specialized
  * cases. Generic case should not be called.
@@ -721,11 +779,31 @@
 std::enable_if_t<unpacket_traits<Packet>::masked_store_available, void>
 pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
 
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
- { return ploadu<Packet>(from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
+{ return ploadu<Packet>(from); }
 
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
- { pstore(to, from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    elements[i] = from[i*stride];
+  }
+  return pload<Packet>(elements);
+}
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
+{ pstore(to, from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    to[i*stride] = elements[i];
+  }
+}
 
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
@@ -996,6 +1074,17 @@
     return ploadu<Packet>(from);
 }
 
+/** \internal \returns n elements of a packet version of \a *from.
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    return pload_partial<Packet>(from, n, offset);
+  else
+    return ploadu_partial<Packet>(from, n);
+}
+
 /** \internal copy the packet \a from to \a *to.
   * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
 template<typename Scalar, typename Packet, int Alignment>
@@ -1007,6 +1096,17 @@
     pstoreu(to, from);
 }
 
+/** \internal copy n elements of the packet \a from to \a *to.
+  * The pointer \a to must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    pstore_partial(to, from, n, offset);
+  else
+    pstoreu_partial(to, from, n);
+}
+
 /** \internal \returns a packet version of \a *from.
   * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
   * hardware if available to speedup the loading of data that won't be modified
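The generic implementations above fix the semantics that the architecture-specific overloads further down must reproduce: a partial load zero-fills the lanes outside [offset, offset + n), and a partial store writes back exactly n lanes starting at lane offset. A small illustration, not part of the patch, again assuming Packet4f:

    #include <Eigen/Core>
    using namespace Eigen::internal;

    void partial_semantics_demo() {
      EIGEN_ALIGN_MAX float src[4] = {10.f, 20.f, 30.f, 40.f};
      // Load n = 2 elements into lanes starting at offset 1: p = {0, 10, 20, 0}.
      Packet4f p = pload_partial<Packet4f>(src, /*n=*/2, /*offset=*/1);
      EIGEN_ALIGN_MAX float dst[4] = {0.f, 0.f, 0.f, 0.f};
      // Write lanes [1, 3) to dst[0..2): dst = {10, 20, 0, 0}.
      pstore_partial(dst, p, /*n=*/2, /*offset=*/1);
    }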
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index e0bde54..222eaf5 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -638,7 +638,7 @@
       *
       * \see class Map
       */
-    //@{
+    ///@{
     static inline ConstMapType Map(const Scalar* data)
     { return ConstMapType(data); }
     static inline MapType Map(Scalar* data)
@@ -702,7 +702,7 @@
     template<int Outer, int Inner>
     static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
     { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
-    //@}
+    ///@}
 
     using Base::setConstant;
     EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index 477c50f..cb7cfdf 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef GEMM_KERNEL_H
-#define GEMM_KERNEL_H
+#ifndef EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
 
 #if EIGEN_COMP_MSVC
 #include <intrin.h>
@@ -20,6 +20,11 @@
 
 #include "../../InternalHeaderCheck.h"
 
+#if !defined(EIGEN_USE_AVX512_GEMM_KERNELS)
+// Disable new AVX512 kernels by default.
+#define EIGEN_USE_AVX512_GEMM_KERNELS 0
+#endif
+
 #define SECOND_FETCH (32)
 #if (EIGEN_COMP_GNUC_STRICT != 0) && !defined(EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS)
 // Use less registers to load A elements to workaround compiler spills. Loose a
@@ -930,6 +935,8 @@
   g.template compute_kern<max_a_unroll, max_b_unroll>();
 }
 
+// Template specializations of GEBP kernels with nr = 8.
+#if EIGEN_USE_AVX512_GEMM_KERNELS
 template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
 class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
     : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
@@ -1218,8 +1225,11 @@
     }
   }
 }
+#endif // EIGEN_USE_AVX512_GEMM_KERNELS
 
 }  // namespace internal
 }  // namespace Eigen
 
-#endif  // GEMM_KERNEL_H
+#undef SECOND_FETCH
+
+#endif  // EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
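Because the guard above uses #if !defined(...), the new nr = 8 kernels remain available as an opt-in: define the macro to 1 before any Eigen header is included. The compiler flags below are illustrative:

    // g++ -O2 -mavx512f -mavx512dq -DEIGEN_USE_AVX512_GEMM_KERNELS=1 ...
    // or, equivalently, in code before including Eigen:
    #define EIGEN_USE_AVX512_GEMM_KERNELS 1
    #include <Eigen/Dense>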
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index 1b351ea..94f1f5a 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_TRSM_KERNEL_IMPL_H
-#define EIGEN_TRSM_KERNEL_IMPL_H
+#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
 
 #include "../../InternalHeaderCheck.h"
 
@@ -106,6 +106,10 @@
   int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
   return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
 }
+#else // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
 #endif
 
 /**
@@ -1184,4 +1188,4 @@
 #endif  // EIGEN_USE_AVX512_TRSM_KERNELS
 }  // namespace internal
 }  // namespace Eigen
-#endif  // EIGEN_TRSM_KERNEL_IMPL_H
+#endif  // EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
diff --git a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
index 032937c..6b09424 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
+++ b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_UNROLLS_IMPL_H
-#define EIGEN_UNROLLS_IMPL_H
+#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
+#define EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
 
 template <bool isARowMajor = true>
 static EIGEN_ALWAYS_INLINE int64_t idA(int64_t i, int64_t j, int64_t LDA) {
@@ -1209,4 +1209,4 @@
 };
 }  // namespace unrolls
 
-#endif  // EIGEN_UNROLLS_IMPL_H
+#endif  // EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index ba5a3fd..6046035 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -132,10 +132,20 @@
 
 template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
+{
+  return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
+}
+template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
+{
+  return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
 
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> *  to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> *  to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); }
 
 EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
@@ -157,19 +167,46 @@
   return Packet2cf(res0);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>*     from)
 {
-  EIGEN_ALIGN16 std::complex<float> af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet2cf>(af);
+  Packet2cf res;
+  res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
+  return res;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
 {
-  EIGEN_ALIGN16 std::complex<float> af[2];
-  pstore<std::complex<float> >((std::complex<float> *) af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+  EIGEN_ALIGN16 Scalar af[2];
+  for (Index i = 0; i < n; i++) {
+    af[i] = from[i*stride];
+  }
+  return pload_ignore<Packet>(af);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n)
+{
+  return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n);
+}
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
+{
+  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
+  EIGEN_ALIGN16 Scalar af[2];
+  pstore<Scalar>((Scalar *) af, from);
+  for (Index i = 0; i < n; i++) {
+    to[i*stride] = af[i];
+  }
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride, const Index n)
+{
+  pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
@@ -336,17 +373,35 @@
 
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
+{
+  return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
+}
+template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n)
+{
+  return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2));
+}
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
+template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> *  to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> *  to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
 {
   return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
+{
+  pstore<std::complex<double> >(to, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index, const Index)
 {
   pstore<std::complex<double> >(to, from);
 }
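Note that the complex overloads above forward to the real-valued kernels with n and offset doubled, since each std::complex<float> occupies two float lanes of the underlying Packet4f (likewise Packet1cd over Packet2d). A short illustration, not part of the patch, assuming a target that defines Packet2cf:

    #include <Eigen/Core>
    using namespace Eigen::internal;

    void complex_partial_demo() {
      std::complex<float> src[2] = { {1.f, 2.f}, {3.f, 4.f} };
      // n = 1 complex element forwards as two float lanes; the upper two
      // lanes of the underlying Packet4f are zero-filled.
      Packet2cf p = ploadu_partial<Packet2cf>(src, /*n=*/1);
      std::complex<float> dst[1];
      pstoreu_partial(dst, p, /*n=*/1);  // dst[0] == std::complex<float>(1, 2)
    }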
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 940a817..9d00b93 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -207,12 +207,8 @@
     } \
   }
 
-#if EIGEN_COMP_LLVM
-#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2)
-#else
 #define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
   b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize));
-#endif
 
 #define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
   if (GEMV_GETN(N) > iter1) { \
@@ -231,8 +227,9 @@
 #if EIGEN_COMP_LLVM
 #define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
   ResPacket f##iter2[2]; \
-  f##iter2[0] = pmadd(result##iter2.packet[0], palpha, ploadu<ResPacket>(res + i + ((iter4) * ResPacketSize))); \
-  f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, ploadu<ResPacket>(res + i + (((iter4) + 1) * ResPacketSize))); \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
+  f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
+  f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
   GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
 #else
 #define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
@@ -932,7 +929,7 @@
 {
     PResPacket c2 = pcplxflipconj(c0);
     PResPacket c3 = pcplxflipconj(c1);
-#if EIGEN_COMP_LLVM || !defined(_ARCH_PWR10)
+#if !defined(_ARCH_PWR10)
     ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
     ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
     PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
@@ -941,6 +938,13 @@
     pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
 #else
     __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize));
+#if EIGEN_COMP_LLVM
+    PResPacket c6[2];
+    __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
+    c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
+    c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
+    GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
+#else
     if (GEMV_IS_COMPLEX_FLOAT) {
         __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
         __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
@@ -948,6 +952,7 @@
         __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
         __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
     }
+#endif
     *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a;
 #endif
 }
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 91b3e20..4dd53f6 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -143,6 +143,12 @@
   #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
 #endif
 
+#if EIGEN_COMP_LLVM
+#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
+#else
+#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
+#endif
+
 template <>
 struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
@@ -472,6 +478,118 @@
 }
 
 template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from)
+{
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  // Ignore warnings about reading partially initialized input memory
+#if !EIGEN_COMP_LLVM
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef __VSX__
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+  return vec_ld(0, from);
+#endif
+#if !EIGEN_COMP_LLVM
+  #pragma GCC diagnostic pop
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16*     from)
+{
+  return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
+{
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_UNUSED_VARIABLE(from);
+  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    load = Packet(vec_sro(Packet16uc(load), shift));
+#else
+    load = Packet(vec_slo(Packet16uc(load), shift));
+#endif
+  }
+  return load;
+#else
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+  unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
+  unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+  Index n2 = n * size;
+  Index i = 0;
+  if (16 <= n2) {
+    pstoreu(load2, ploadu<Packet16uc>(from2));
+    i += 16;
+  }
+  if (i + 8 <= n2) {
+    *reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
+    i += 8;
+  }
+  if (i + 4 <= n2) {
+    *reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
+    i += 4;
+  }
+  if (i + 2 <= n2) {
+    *reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
+    i += 2;
+  }
+  if (i < n2) {
+    *reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
+  }
+  return pload_ignore<Packet>(load);
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet4f>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet4i>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet8s>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet8us>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet16c>(from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet16uc>(from, n, offset);
+}
+
+template <typename Packet>
 EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
   // some versions of GCC throw "unused-but-set-parameter" (float *to).
   // ignoring these warnings for now.
@@ -519,6 +637,90 @@
   pstore_common<Packet16uc>(to, from);
 }
 
+template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from, const Index n, const Index offset)
+{
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet store = from;
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    store = Packet(vec_slo(Packet16uc(store), shift));
+#else
+    store = Packet(vec_sro(Packet16uc(store), shift));
+#endif
+  }
+  vec_xst_len(store, to, n * size);
+#else
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
+  pstore(store, from);
+  unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
+  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+  Index n2 = n * size;
+  Index i = 0;
+  if (16 <= n2) {
+    pstore(to2, ploadu<Packet16uc>(store2));
+    i += 16;
+  }
+  if (i + 8 <= n2) {
+    *reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
+    i += 8;
+  }
+  if (i + 4 <= n2) {
+    *reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
+    i += 4;
+  }
+  if (i + 2 <= n2) {
+    *reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
+    i += 2;
+  }
+  if (i < n2) {
+    *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
+  }
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float*  to, const Packet4f& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet4f>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int*  to, const Packet4i& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet4i>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int*  to, const Packet8s& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet8s>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int*  to, const Packet8us& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet8us>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16*      to, const Packet8bf& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char*  to, const Packet16c& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet16c>(to, from, n, offset);
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char*  to, const Packet16uc& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet16uc>(to, from, n, offset);
+}
+
 template<typename Packet>
 EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
 {
@@ -596,168 +798,167 @@
   pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
 {
-  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
-  a[0] = from[0*stride];
-  a[1] = from[1*stride];
-  a[2] = from[2*stride];
-  a[3] = from[3*stride];
-  return pload<Packet>(a);
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
+  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+  LOAD_STORE_UNROLL_16
+  for (Index i = 0; i < n; i++) {
+    a[i] = from[i*stride];
+  }
+  // Leave the rest of the array uninitialized
+  return pload_ignore<Packet>(a);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
   return pgather_common<Packet4f>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
   return pgather_common<Packet4i>(from, stride);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
 {
-  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
-  a[0] = from[0*stride];
-  a[1] = from[1*stride];
-  a[2] = from[2*stride];
-  a[3] = from[3*stride];
-  a[4] = from[4*stride];
-  a[5] = from[5*stride];
-  a[6] = from[6*stride];
-  a[7] = from[7*stride];
-  return pload<Packet>(a);
+  return pgather_common<Packet8s>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
 {
-  return pgather_size8<Packet8s>(from, stride);
+  return pgather_common<Packet8us>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
 {
-  return pgather_size8<Packet8us>(from, stride);
+  return pgather_common<Packet8bf>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
 {
-  return pgather_size8<Packet8bf>(from, stride);
+  return pgather_common<Packet16c>(from, stride);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
 {
-  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
-  a[0] = from[0*stride];
-  a[1] = from[1*stride];
-  a[2] = from[2*stride];
-  a[3] = from[3*stride];
-  a[4] = from[4*stride];
-  a[5] = from[5*stride];
-  a[6] = from[6*stride];
-  a[7] = from[7*stride];
-  a[8] = from[8*stride];
-  a[9] = from[9*stride];
-  a[10] = from[10*stride];
-  a[11] = from[11*stride];
-  a[12] = from[12*stride];
-  a[13] = from[13*stride];
-  a[14] = from[14*stride];
-  a[15] = from[15*stride];
-  return pload<Packet>(a);
+  return pgather_common<Packet16uc>(from, stride);
 }
 
-
-template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride, const Index n)
 {
-  return pgather_size16<Packet16c>(from, stride);
+  return pgather_common<Packet4f>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride, const Index n)
 {
-  return pgather_size16<Packet16uc>(from, stride);
+  return pgather_common<Packet4i>(from, stride, n);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride, const Index n)
 {
-  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  return pgather_common<Packet8s>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n)
+{
+  return pgather_common<Packet8us>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride, const Index n)
+{
+  return pgather_common<Packet8bf>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from, Index stride, const Index n)
+{
+  return pgather_common<Packet16c>(from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from, Index stride, const Index n)
+{
+  return pgather_common<Packet16uc>(from, stride, n);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
+  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
   pstore<__UNPACK_TYPE__(Packet)>(a, from);
-  to[0*stride] = a[0];
-  to[1*stride] = a[1];
-  to[2*stride] = a[2];
-  to[3*stride] = a[3];
+  LOAD_STORE_UNROLL_16
+  for (Index i = 0; i < n; i++) {
+    to[i*stride] = a[i];
+  }
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  pscatter_size4<Packet4f>(to, from, stride);
+  pscatter_common<Packet4f>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
 {
-  pscatter_size4<Packet4i>(to, from, stride);
+  pscatter_common<Packet4i>(to, from, stride);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
 {
-  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
-  pstore<__UNPACK_TYPE__(Packet)>(a, from);
-  to[0*stride] = a[0];
-  to[1*stride] = a[1];
-  to[2*stride] = a[2];
-  to[3*stride] = a[3];
-  to[4*stride] = a[4];
-  to[5*stride] = a[5];
-  to[6*stride] = a[6];
-  to[7*stride] = a[7];
+  pscatter_common<Packet8s>(to, from, stride);
 }
 
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
 {
-  pscatter_size8<Packet8s>(to, from, stride);
+  pscatter_common<Packet8us>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
 {
-  pscatter_size8<Packet8us>(to, from, stride);
+  pscatter_common<Packet8bf>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
 {
-  pscatter_size8<Packet8bf>(to, from, stride);
+  pscatter_common<Packet16c>(to, from, stride);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
 {
-  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
-  pstore<__UNPACK_TYPE__(Packet)>(a, from);
-  to[0*stride] = a[0];
-  to[1*stride] = a[1];
-  to[2*stride] = a[2];
-  to[3*stride] = a[3];
-  to[4*stride] = a[4];
-  to[5*stride] = a[5];
-  to[6*stride] = a[6];
-  to[7*stride] = a[7];
-  to[8*stride] = a[8];
-  to[9*stride] = a[9];
-  to[10*stride] = a[10];
-  to[11*stride] = a[11];
-  to[12*stride] = a[12];
-  to[13*stride] = a[13];
-  to[14*stride] = a[14];
-  to[15*stride] = a[15];
+  pscatter_common<Packet16uc>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
 {
-  pscatter_size16<Packet16c>(to, from, stride);
+  pscatter_common<Packet4f>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
 {
-  pscatter_size16<Packet16uc>(to, from, stride);
+  pscatter_common<Packet4i>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
+{
+  pscatter_common<Packet8s>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
+{
+  pscatter_common<Packet8us>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride, const Index n)
+{
+  pscatter_common<Packet8bf>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
+{
+  pscatter_common<Packet16c>(to, from, stride, n);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
+{
+  pscatter_common<Packet16uc>(to, from, stride, n);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f   plset<Packet4f>(const float&     a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN;  }
@@ -1008,6 +1209,73 @@
   return ploadu_common<Packet16uc>(from);
 }
 
+template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+#else
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+  unsigned char* load2 = reinterpret_cast<unsigned char *>(load);
+  unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+  Index n2 = n * size;
+  Index i = 0;
+  if (16 <= n2) {
+    pstore(load2, ploadu<Packet16uc>(from2));
+    i += 16;
+  }
+  if (i + 8 <= n2) {
+    *reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
+    i += 8;
+  }
+  if (i + 4 <= n2) {
+    *reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
+    i += 4;
+  }
+  if (i + 2 <= n2) {
+    *reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
+    i += 2;
+  }
+  if (i < n2) {
+    *reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
+  }
+  return pload_ignore<Packet>(load);
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n)
+{
+  return ploadu_partial_common<Packet4f>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n)
+{
+  return ploadu_partial_common<Packet4i>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n)
+{
+  return ploadu_partial_common<Packet8s>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n)
+{
+  return ploadu_partial_common<Packet8us>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n)
+{
+  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n)
+{
+  return ploadu_partial_common<Packet16c>(from, n);
+}
+template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n)
+{
+  return ploadu_partial_common<Packet16uc>(from, n);
+}
+
 template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
 {
   Packet p;
@@ -1128,6 +1396,77 @@
   pstoreu_common<Packet16uc>(to, from);
 }
 
+template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst_len(from, to, n * size);
+#else
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
+  pstore(store, from);
+  unsigned char* store2 = reinterpret_cast<unsigned char *>(store);
+  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+  Index n2 = n * size;
+  Index i = 0;
+  if (16 <= n2) {
+    pstoreu(to2, pload<Packet16uc>(store2));
+    i += 16;
+  }
+  if (i + 8 <= n2) {
+    *reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
+    i += 8;
+  }
+  if (i + 4 <= n2) {
+    *reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
+    i += 4;
+  }
+  if (i + 2 <= n2) {
+    *reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
+    i += 2;
+  }
+  if (i < n2) {
+    *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
+  }
+#endif
+}
+
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float*  to, const Packet4f& from, const Index n)
+{
+  pstoreu_partial_common<Packet4f>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int*  to, const Packet4i& from, const Index n)
+{
+  pstoreu_partial_common<Packet4i>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int*  to, const Packet8s& from, const Index n)
+{
+  pstoreu_partial_common<Packet8s>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int*  to, const Packet8us& from, const Index n)
+{
+  pstoreu_partial_common<Packet8us>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16*      to, const Packet8bf& from, const Index n)
+{
+  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char*  to, const Packet16c& from, const Index n)
+{
+  pstoreu_partial_common<Packet16c>(to, from, n);
+}
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char*  to, const Packet16uc& from, const Index n)
+{
+  pstoreu_partial_common<Packet16uc>(to, from, n);
+}
+
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
@@ -2387,12 +2726,22 @@
   return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
 }
 
+template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
+{
+  return pload_partial_common<Packet2d>(from, n, offset);
+}
+
 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
   EIGEN_DEBUG_ALIGNED_STORE
   vec_xst(from, 0, to);
 }
 
+template<> EIGEN_ALWAYS_INLINE void pstore_partial<double>(double*  to, const Packet2d& from, const Index n, const Index offset)
+{
+  pstore_partial_common<Packet2d>(to, from, n, offset);
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
   Packet2d v = {from, from};
   return v;
@@ -2414,19 +2763,21 @@
   a3 = pset1<Packet2d>(a[3]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  EIGEN_ALIGN16 double af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
- return pload<Packet2d>(af);
+  return pgather_common<Packet2d>(from, stride);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
 {
-  EIGEN_ALIGN16 double af[2];
-  pstore<double>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  return pgather_common<Packet2d>(from, stride, n);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  pscatter_common<Packet2d>(to, from, stride);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
+{
+  pscatter_common<Packet2d>(to, from, stride, n);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
@@ -2517,6 +2868,11 @@
   return vec_xl(0, const_cast<double*>(from));
 }
 
+template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n)
+{
+  return ploadu_partial_common<Packet2d>(from, n);
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
   Packet2d p;
@@ -2531,6 +2887,11 @@
   vec_xst(from, 0, to);
 }
 
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double*  to, const Packet2d& from, const Index n)
+{
+  pstoreu_partial_common<Packet2d>(to, from, n);
+}
+
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index e2eef19..56473a9 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -100,6 +100,11 @@
     return ploadt<PacketType, AlignmentType>(m_data + i);
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const {
+    return ploadt_partial<PacketType, AlignmentType>(m_data + i, n, offset);
+  }
+
   template<typename PacketType, int AlignmentT>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const {
     return ploadt<PacketType, AlignmentT>(m_data + i);
@@ -110,6 +115,11 @@
     pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index offset = 0) const {
+    pstoret_partial<Scalar, PacketType, AlignmentType>(m_data + i, p, n, offset);
+  }
+
 protected:
   Scalar *m_data;
 };
@@ -208,6 +218,11 @@
     return ploadt<PacketType, AlignmentType>(&operator()(i, j));
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index offset = 0) const {
+    return ploadt_partial<PacketType, AlignmentType>(&operator()(i, j), n, offset);
+  }
+
   template <typename PacketT, int AlignmentT>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
     return ploadt<PacketT, AlignmentT>(&operator()(i, j));
@@ -218,6 +233,11 @@
     pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index offset = 0) const {
+    pstoret_partial<Scalar, PacketType, AlignmentType>(&operator()(i, j), p, n, offset);
+  }
+
   template<typename SubPacket>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
     pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@@ -272,10 +292,20 @@
   }
 
   template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const {
+    return pgather_partial<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value(), n);
+  }
+
+  template<typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
     pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/) const {
+    pscatter_partial<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value(), n);
+  }
+
 protected:
   Scalar *m_data;
   const internal::variable_if_dynamic<Index,Incr> m_incr;
@@ -312,6 +342,11 @@
     return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/) const {
+    return pgather_partial<Scalar,PacketType>(&operator()(i, j),m_incr.value(),n);
+  }
+
   template <typename PacketT, int AlignmentT>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
     return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
@@ -322,6 +357,11 @@
     pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
   }
 
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/) const {
+    pscatter_partial<Scalar, PacketType>(&operator()(i, j), p, m_incr.value(), n);
+  }
+
   template<typename SubPacket>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
     pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
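At the mapper level, the partial variants let a kernel drive both the contiguous and the strided (Incr != 1) mappers through one interface, with the strided mappers falling back to pgather_partial/pscatter_partial. A hypothetical call pattern, not code from this patch (scale_column is an illustrative name):

    #include <Eigen/Core>

    // Scale column j of a mapped matrix; one partial packet covers the
    // tail rows instead of a scalar loop.
    template <typename DataMapper, typename Packet>
    void scale_column(const DataMapper& res, Eigen::Index rows,
                      Eigen::Index j, const Packet& pAlpha) {
      const Eigen::Index packet_size =
          Eigen::internal::unpacket_traits<Packet>::size;
      Eigen::Index i = 0;
      for (; i + packet_size <= rows; i += packet_size) {
        Packet p = res.template loadPacket<Packet>(i, j);
        res.storePacket(i, j, Eigen::internal::pmul(pAlpha, p));
      }
      if (const Eigen::Index n = rows - i) {
        Packet p = res.template loadPacketPartial<Packet>(i, j, n);
        res.storePacketPartial(i, j, Eigen::internal::pmul(pAlpha, p), n);
      }
    }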
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 4ba1370..e473215 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -317,7 +317,7 @@
     extern "C" {
       // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
       // Doing so triggers some issues with ICC. However, old gcc versions seem to not have this file, thus:
-      #if EIGEN_COMP_ICC >= 1110
+      #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
         #include <immintrin.h>
       #else
         #include <mmintrin.h>
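
For context, the new condition keys off Eigen's compiler-detection macro for Emscripten. A simplified sketch of how such a macro is typically derived (the real definition lives in Eigen's Macros.h and may differ in detail):

  #if defined(__EMSCRIPTEN__)   // defined by emcc
    #define EIGEN_COMP_EMSCRIPTEN 1
  #else
    #define EIGEN_COMP_EMSCRIPTEN 0
  #endif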
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 7657ead..9823fa3 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -216,6 +216,7 @@
   */
 inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
 {
+  if (ptr == 0) return aligned_malloc(new_size);
   EIGEN_UNUSED_VARIABLE(old_size)
 
   void *result;
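
The early return gives aligned_realloc the same null-pointer semantics as std::realloc: reallocating from a null pointer degenerates to a fresh allocation. A minimal usage sketch (illustrative only):

  float* buf = nullptr;
  // ptr == 0, so this now forwards to aligned_malloc(new_size) instead of
  // trying to reallocate a block that was never allocated.
  buf = static_cast<float*>(
      Eigen::internal::aligned_realloc(buf, 16 * sizeof(float), /*old_size=*/0));
  // ... use buf ...
  Eigen::internal::aligned_free(buf);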
diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h
index 74c6561..3ab0c72 100644
--- a/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -72,8 +72,8 @@
 
 namespace Eigen {
 namespace internal {
-  
-typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; 
+
+enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL};
 
 template <typename IndexVector, typename ScalarVector>
 struct LU_GlobalLU_t {
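
A plausible reading of this change (and the matching one in TensorTraits.h below): `typedef enum {...} MemType;` leaves the enum type itself unnamed, with MemType only a typedef alias, and unnamed types in widely included headers are a known trigger for ODR warnings under link-time optimization. Naming the enum directly gives the type a proper linkage name. Schematically:

  typedef enum { A, B } E;  // C-style: the enum type itself has no name
  enum E { A, B };          // C++-style: the type is named E directly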
diff --git a/test/gpu_example.cu b/test/gpu_example.cu
index 9bc34e5..a69f5ea 100644
--- a/test/gpu_example.cu
+++ b/test/gpu_example.cu
@@ -21,7 +21,7 @@
   EIGEN_DEVICE_FUNC
   Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
     C = A + B;       // Populate output parameter.
-    Type3 D = A + B; // Populate return value.    
+    Type3 D = A + B; // Populate return value.
     return D;
   }
 };
@@ -31,7 +31,7 @@
 void test_add(const T& type) {
   const Index rows = type.rows();
   const Index cols = type.cols();
-  
+
   // Create random inputs.
   const T A = T::Random(rows, cols);
   const T B = T::Random(rows, cols);
@@ -39,17 +39,17 @@
 
   // Create kernel.
   AddKernel add_kernel;
-  
+
   // Run add_kernel(A, B, C) via run(...).
   // This will run on the GPU if using a GPU compiler, or CPU otherwise,
   // facilitating generic tests that can run on either.
   T D = run(add_kernel, A, B, C);
-  
+
   // Check that both output parameter and return value are correctly populated.
   const T expected = A + B;
   VERIFY_IS_CWISE_EQUAL(C, expected);
   VERIFY_IS_CWISE_EQUAL(D, expected);
-  
+
   // In a GPU-only test, we can verify that the CPU and GPU produce the
   // same results.
   T C_cpu, C_gpu;
@@ -70,31 +70,30 @@
 
 template <typename T1, typename T2, typename T3>
 void test_multiply(const T1& type1, const T2& type2, const T3& type3) {
-  
   const T1 A = T1::Random(type1.rows(), type1.cols());
   const T2 B = T2::Random(type2.rows(), type2.cols());
   T3 C;
 
   MultiplyKernel multiply_kernel;
-  
+
   // The run(...) family of functions uses a memory buffer to transfer data
   // to and from the device.  The size of this buffer is estimated from the
   // size of all input parameters.  If the estimated buffer size is not
   // sufficient for transferring outputs from device to host, then an
   // explicit buffer size needs to be specified.
-  
+
   // 2 outputs of size (A * B). For each matrix output, the buffer will store
   // the number of rows, columns, and the data.
   size_t buffer_capacity_hint = 2 * (                     // 2 output parameters
     2 * sizeof(typename T3::Index)                        // # Rows, # Cols
     + A.rows() * B.cols() * sizeof(typename T3::Scalar)); // Output data
-  
+
   T3 D = run_with_hint(buffer_capacity_hint, multiply_kernel, A, B, C);
-  
+
   const T3 expected = A * B;
   VERIFY_IS_CWISE_APPROX(C, expected);
   VERIFY_IS_CWISE_APPROX(D, expected);
-  
+
   T3 C_cpu, C_gpu;
   T3 D_cpu = run_on_cpu(multiply_kernel, A, B, C_cpu);
   T3 D_gpu = run_on_gpu_with_hint(buffer_capacity_hint,
@@ -107,30 +106,24 @@
 EIGEN_DECLARE_TEST(gpu_example)
 {
   // For each repeat, call the desired subtests.
-  for(int i = 0; i < g_repeat; i++) {    
+  for(int i = 0; i < g_repeat; i++) {
     // Call subtests with different sized/typed inputs.
     CALL_SUBTEST( test_add(Eigen::Vector3f()) );
     CALL_SUBTEST( test_add(Eigen::Matrix3d()) );
-#if !defined(EIGEN_USE_HIP) // FIXME
     CALL_SUBTEST( test_add(Eigen::MatrixX<int>(10, 10)) );
-#endif
 
     CALL_SUBTEST( test_add(Eigen::Array44f()) );
-#if !defined(EIGEN_USE_HIP)
     CALL_SUBTEST( test_add(Eigen::ArrayXd(20)) );
     CALL_SUBTEST( test_add(Eigen::ArrayXXi(13, 17)) );
-#endif
 
     CALL_SUBTEST( test_multiply(Eigen::Matrix3d(),
                                 Eigen::Matrix3d(),
                                 Eigen::Matrix3d()) );
-#if !defined(EIGEN_USE_HIP)
     CALL_SUBTEST( test_multiply(Eigen::MatrixX<int>(10, 10),
                                 Eigen::MatrixX<int>(10, 10),
                                 Eigen::MatrixX<int>()) );
     CALL_SUBTEST( test_multiply(Eigen::MatrixXf(12, 1),
                                 Eigen::MatrixXf(1, 32),
                                 Eigen::MatrixXf()) );
-#endif
   }
 }
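
As a concrete check of the hint arithmetic in test_multiply: for the MatrixXf(12, 1) * MatrixXf(1, 32) subtest, assuming a typical 8-byte Eigen::Index and 4-byte float, the capacity works out to 3104 bytes (a worked example, not part of the test file):

  // 2 outputs (C and D), each storing rows, cols, and the 12*32 result:
  //   2 * (2 * 8 + 12 * 32 * 4) = 2 * (16 + 1536) = 3104 bytes
  size_t hint = 2 * (2 * sizeof(Eigen::Index)
                     + 12 * 32 * sizeof(float));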
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 2d8e708..163ef47 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -458,6 +458,36 @@
     VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
   }
 
+  for (int M = 0; M < PacketSize; ++M) {
+    for (int N = 0; N <= PacketSize; ++N) {
+      for (int j = 0; j < size; ++j) {
+        data1[j] = internal::random<Scalar>() / RealScalar(PacketSize);
+        data2[j] = internal::random<Scalar>() / RealScalar(PacketSize);
+        refvalue = (std::max)(refvalue, numext::abs(data1[j]));
+      }
+
+      if (M == 0) {
+        internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N), N);
+        VERIFY(test::areApprox(data1, data2, N) && "aligned loadN/storeN");
+
+        for (int offset = 0; offset < PacketSize; ++offset) {
+          internal::pstore_partial(data2, internal::ploadu_partial<Packet>(data1 + offset, N), N);
+          VERIFY(test::areApprox(data1 + offset, data2, N) && "internal::ploadu_partial");
+        }
+
+        for (int offset = 0; offset < PacketSize; ++offset) {
+          internal::pstoreu_partial(data2 + offset, internal::pload_partial<Packet>(data1, N), N);
+          VERIFY(test::areApprox(data1, data2 + offset, N) && "internal::pstoreu_partial");
+        }
+      }
+
+      if (N + M > PacketSize) continue;  // Don't read or write past end of Packet
+
+      internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N, M), N, M);
+      VERIFY(test::areApprox(data1, data2, N) && "aligned offset loadN/storeN");
+    }
+  }
+
   if (internal::unpacket_traits<Packet>::masked_load_available) {
     test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
     unsigned long long max_umask = (0x1ull << PacketSize);
@@ -1372,6 +1402,36 @@
   for (int i = 0; i < PacketSize; ++i) {
     VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
   }
+
+  for (Index N = 0; N <= PacketSize; ++N) {
+    for (Index i = 0; i < N; ++i) {
+      data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    }
+
+    for (Index i = 0; i < N * 20; ++i) {
+      buffer[i] = Scalar(0);
+    }
+
+    packet = internal::pload_partial<Packet>(data1, N);
+    internal::pscatter_partial<Scalar, Packet>(buffer, packet, stride, N);
+
+    for (Index i = 0; i < N * 20; ++i) {
+      if ((i % stride) == 0 && i < stride * N) {
+        VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter_partial");
+      } else {
+        VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter_partial");
+      }
+    }
+
+    for (Index i = 0; i < N * 7; ++i) {
+      buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    }
+    packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
+    internal::pstore_partial(data1, packet, N);
+    for (Index i = 0; i < N; ++i) {
+      VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather_partial");
+    }
+  }
 }
 
 namespace Eigen {
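
A minimal sketch of the contract these tests exercise, assuming a 4-lane float packet (e.g. SSE) and the generic implementations (not part of the test file itself):

  typedef Eigen::internal::packet_traits<float>::type Packet;
  EIGEN_ALIGN_MAX float in[4]  = {1.f, 2.f, 3.f, 4.f};
  EIGEN_ALIGN_MAX float out[4] = {-1.f, -1.f, -1.f, -1.f};
  Packet p = Eigen::internal::pload_partial<Packet>(in, 3);  // lane 3 zero-filled
  Eigen::internal::pstore_partial(out, p, 3);                // out[3] untouched
  // out is now {1.f, 2.f, 3.f, -1.f}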
diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
index 1d1be3b..62fc0b3 100644
--- a/unsupported/Eigen/AutoDiff
+++ b/unsupported/Eigen/AutoDiff
@@ -10,6 +10,8 @@
 #ifndef EIGEN_AUTODIFF_MODULE_H
 #define EIGEN_AUTODIFF_MODULE_H
 
+#include "../../Eigen/Core"
+
 namespace Eigen {
 
 /**
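
With the Core include in place, the module header is self-contained, so a translation unit can include it on its own. A minimal sketch:

  #include <unsupported/Eigen/AutoDiff>

  int main() {
    // AutoDiffScalar pairs a value with a derivative vector, a Core type.
    Eigen::AutoDiffScalar<Eigen::Vector2d> x(1.0, 2, 0);  // 2 derivatives, seeded in slot 0
    return 0;
  }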
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 295f6e5..8c705ec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -256,10 +256,10 @@
 // the SAME case.
 // When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
 // Pc=0.
-typedef enum {
+enum PaddingType {
   PADDING_VALID = 1,
   PADDING_SAME = 2
-} PaddingType;
+};
 
 }  // end namespace Eigen
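
For reference, the padding arithmetic these enumerators select, worked through for a 5x5 input and a 3x3 kernel at stride 1 (an illustrative example consistent with the comment above):

  // PADDING_VALID: R' = R - K + 1 = 5 - 3 + 1 = 3  (3x3 output, Pr = Pc = 0)
  // PADDING_SAME:  R' = ceil(R / stride) = 5       (5x5 output; total padding
  //                Pr = K - 1 = 2, split between the two borders)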