Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/3d9051ea84a5089b277c88dac456b3b1576bfa7f

BEGIN_PUBLIC

Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/3d9051ea84a5089b277c88dac456b3b1576bfa7f

Additional minor changes to account for API change in `TensorDeviceGpu` - static variables replaced
by a safer singleton class.

END_PUBLIC

For TFRT, the OSS eigen version hasn't been updated in over a year.  This is now corrected, and
required a small change in handling `__shfl_down_sync` for `Eigen::half`, for which an overload
is now provided.

PiperOrigin-RevId: 374792818
diff --git a/Eigen/LU b/Eigen/LU
index 0fb184b..1236ceb 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -38,9 +38,7 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
 
-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
   #include "src/LU/arch/InverseSize4.h"
 #endif
 
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index f6e1d0a..9acca6c 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -163,6 +163,30 @@
   EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
 
+struct plain_array_helper {
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void copy(const plain_array<T, Size, MatrixOrArrayOptions, Alignment>& src, const Eigen::Index size,
+                         plain_array<T, Size, MatrixOrArrayOptions, Alignment>& dst) {
+    smart_copy(src.array, src.array + size, dst.array);
+  }
+  
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void swap(plain_array<T, Size, MatrixOrArrayOptions, Alignment>& a, const Eigen::Index a_size,
+                   plain_array<T, Size, MatrixOrArrayOptions, Alignment>& b, const Eigen::Index b_size) {
+    if (a_size < b_size) {
+      std::swap_ranges(b.array, b.array + a_size, a.array);
+      smart_move(b.array + a_size, b.array + b_size, a.array + a_size);
+    } else if (a_size > b_size) {
+      std::swap_ranges(a.array, a.array + b_size, b.array);
+      smart_move(a.array + b_size, a.array + a_size, b.array + b_size);
+    } else {
+      std::swap_ranges(a.array, a.array + a_size, b.array);
+    }
+  }
+};
+
 } // end namespace internal
 
 /** \internal
@@ -268,21 +292,25 @@
     EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+    }
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_rows = other.m_rows;
         m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
       }
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     {
-      numext::swap(m_data,other.m_data);
+      internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
       numext::swap(m_rows,other.m_rows);
       numext::swap(m_cols,other.m_cols);
     }
@@ -303,21 +331,26 @@
     EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
+    }
+    
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_rows = other.m_rows;
+        internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
       }
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_rows,other.m_rows);
+    { 
+      internal::plain_array_helper::swap(m_data, m_rows * _Cols, other.m_data, other.m_rows * _Cols);
+      numext::swap(m_rows, other.m_rows);
     }
     EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;}
@@ -336,20 +369,24 @@
     EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) 
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
+    }
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
       }
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_cols,other.m_cols);
+      internal::plain_array_helper::swap(m_data, _Rows * m_cols, other.m_data, _Rows * other.m_cols);
+      numext::swap(m_cols, other.m_cols);
     }
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 2920121..7f82090 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -260,19 +260,8 @@
   }
 };
 
-template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
-
-#if defined(EIGEN_GPU_COMPILE_PHASE)
-template<typename T>
-struct conj_impl<std::complex<T> >
-{
-  EIGEN_DEVICE_FUNC
-  static inline std::complex<T> run(const std::complex<T>& x)
-  {
-    return std::complex<T>(x.real(), -x.imag());
-  }
-};
-#endif
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_impl : conj_default_impl<Scalar, IsComplex> {};
 
 template<typename Scalar>
 struct conj_retval
@@ -592,8 +581,9 @@
 
 template<typename Scalar>
 struct arg_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC
-  static inline Scalar run(const Scalar& x)
+  static inline RealScalar run(const Scalar& x)
   {
     #if defined(EIGEN_HIP_DEVICE_COMPILE)
     // HIP does not seem to have a native device side implementation for the math routine "arg"
@@ -601,7 +591,7 @@
     #else
     EIGEN_USING_STD(arg);
     #endif
-    return static_cast<Scalar>(arg(x));
+    return static_cast<RealScalar>(arg(x));
   }
 };
 
@@ -612,7 +602,7 @@
   EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
-    return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
+    return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
   }
 };
 #else
@@ -623,7 +613,7 @@
   EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
-    return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
+    return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
   }
 };
 
@@ -2022,6 +2012,18 @@
   }
 };
 
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T>, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x)
+  {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index a2bc0de..09041db 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -7,6 +7,9 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifndef EIGEN_STLITERATORS_H
+#define EIGEN_STLITERATORS_H
+
 namespace Eigen {
 
 namespace internal {
@@ -30,10 +33,10 @@
   typedef Index difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 
-  indexed_based_stl_iterator_base() : mp_xpr(0), m_index(0) {}
-  indexed_based_stl_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
+  indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
 
-  indexed_based_stl_iterator_base(const non_const_iterator& other)
+  indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW
     : mp_xpr(other.mp_xpr), m_index(other.m_index)
   {}
 
@@ -190,17 +193,17 @@
   typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
 
 
-  pointer_based_stl_iterator() : m_ptr(0) {}
-  pointer_based_stl_iterator(XprType& xpr, Index index) : m_incr(xpr.innerStride())
+  pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride())
   {
     m_ptr = xpr.data() + index * m_incr.value();
   }
 
-  pointer_based_stl_iterator(const non_const_iterator& other)
+  pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW
     : m_ptr(other.m_ptr), m_incr(other.m_incr)
   {}
 
-  pointer_based_stl_iterator& operator=(const non_const_iterator& other)
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW
   {
     m_ptr = other.m_ptr;
     m_incr.setValue(other.m_incr);
@@ -456,3 +459,5 @@
 }
 
 } // namespace Eigen
+
+#endif // EIGEN_STLITERATORS_H
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 41929cb..6fd726d 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -119,74 +119,11 @@
   return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
 }
 
-/*template <>
+template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 pexp<Packet8d>(const Packet8d& _x) {
-  Packet8d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
-  _EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
-
-  _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
-  _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-  // clamp x
-  x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
-
-  // Express exp(x) as exp(g + n*log(2)).
-  const Packet8d n =
-      _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
-
-  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
-  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
-  // digits right.
-  const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
-  const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
-  x = psub(x, nC1);
-  x = psub(x, nC2);
-
-  const Packet8d x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet8d px = p8d_cephes_exp_p0;
-  px = pmadd(px, x2, p8d_cephes_exp_p1);
-  px = pmadd(px, x2, p8d_cephes_exp_p2);
-  px = pmul(px, x);
-
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet8d qx = p8d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p8d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q3);
-
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = _mm512_div_pd(px, psub(qx, px));
-  x = pmadd(p8d_2, x, p8d_1);
-
-  // Build e=2^n.
-  const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
-      _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
-
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pmul(x, e), _x);
-  }*/
+  return pexp_double(_x);
+}
 
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index f874137..59bbef0 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -140,6 +140,7 @@
     HasHalfPacket = 1,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
     HasLog  = 1,
+    HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
 #endif
@@ -486,7 +487,7 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
-  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
   return _mm512_castsi512_ps(
       _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
 }
@@ -517,7 +518,7 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
-  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ);
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
   return _mm512_castsi512_pd(
       _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
 }
@@ -929,7 +930,8 @@
   Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
   
   // 2^b
-  Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0));
+  const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
   Packet8i lo = _mm256_slli_epi64(hi, 52);
   hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
   Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
@@ -937,7 +939,7 @@
   
   // 2^(e - 3b)
   b = psub(psub(psub(e, b), b), b);  // e - 3b
-  hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0));
+  hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
   lo = _mm256_slli_epi64(hi, 52);
   hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
   c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index e3ba061..dbdb81e 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -493,21 +493,21 @@
             cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
           }
         } else {
-          const std::complex<Scalar> *lhs0, *lhs1;
+          std::complex<Scalar> lhs0, lhs1;
           if (UseLhs) {
-            lhs0 = &lhs(j + 0, i);
-            lhs1 = &lhs(j + 1, i);
-            cblock.packet[0] = pload2(lhs0, lhs1);
-            lhs0 = &lhs(j + 2, i);
-            lhs1 = &lhs(j + 3, i);
-            cblock.packet[1] = pload2(lhs0, lhs1);
+            lhs0 = lhs(j + 0, i);
+            lhs1 = lhs(j + 1, i);
+            cblock.packet[0] = pload2(&lhs0, &lhs1);
+            lhs0 = lhs(j + 2, i);
+            lhs1 = lhs(j + 3, i);
+            cblock.packet[1] = pload2(&lhs0, &lhs1);
           } else {
-            lhs0 = &lhs(i, j + 0);
-            lhs1 = &lhs(i, j + 1);
-            cblock.packet[0] = pload2(lhs0, lhs1);
-            lhs0 = &lhs(i, j + 2);
-            lhs1 = &lhs(i, j + 3);
-            cblock.packet[1] = pload2(lhs0, lhs1);
+            lhs0 = lhs(i, j + 0);
+            lhs1 = lhs(i, j + 1);
+            cblock.packet[0] = pload2(&lhs0, &lhs1);
+            lhs0 = lhs(i, j + 2);
+            lhs1 = lhs(i, j + 3);
+            cblock.packet[1] = pload2(&lhs0, &lhs1);
           }
         }
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 8edf79c..08855bd 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -99,11 +99,9 @@
 }
 
 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet4f& b)
+EIGEN_STRONG_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&)
 {
-  EIGEN_UNUSED_VARIABLE(acc); // Just for compilation
-  EIGEN_UNUSED_VARIABLE(a);
-  EIGEN_UNUSED_VARIABLE(b);
+  // Just for compilation
 }
 
 template<typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
@@ -150,11 +148,9 @@
 }
 
 template<>
-EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV)
+EIGEN_STRONG_INLINE void ploadRhsMMA(const float*, __vector_pair&)
 {
   // Just for compilation
-  EIGEN_UNUSED_VARIABLE(rhs);
-  EIGEN_UNUSED_VARIABLE(rhsV);
 }
 
 // PEEL_MMA loop factor.
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index b1618e5..deb4c86 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -67,27 +67,26 @@
   const T a_imag = numext::imag(a);
   const T b_real = numext::real(b);
   const T b_imag = numext::imag(b);
-  const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
-  return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
-                          (a_imag * b_real - a_real * b_imag) * norm);
+  const T norm = (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
+                          (a_imag * b_real - a_real * b_imag) / norm);
 }
 
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
   const T b_real = numext::real(b);
   const T b_imag = numext::imag(b);
-  // Guard against over/under-flow.
-  const T scale = T(1) / (numext::abs(b_real) + numext::abs(b_imag));
-  const T a_real_scaled = numext::real(a) * scale;
-  const T a_imag_scaled = numext::imag(a) * scale;
-  const T b_real_scaled = b_real * scale;
-  const T b_imag_scaled = b_imag * scale;
-  
-  const T b_norm2_scaled = b_real_scaled * b_real_scaled + b_imag_scaled * b_imag_scaled;
-  return std::complex<T>(
-      (a_real_scaled * b_real_scaled + a_imag_scaled * b_imag_scaled) / b_norm2_scaled,
-      (a_imag_scaled * b_real_scaled - a_real_scaled * b_imag_scaled) / b_norm2_scaled);
+  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
+  // guards against over/under-flow.
+  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
+  const T rscale = scale_imag ? T(1) : b_real / b_imag;
+  const T iscale = scale_imag ? b_imag / b_real : T(1);
+  const T denominator = b_real * rscale + b_imag * iscale;
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator, 
+                         (a_imag * rscale - a_real * iscale) / denominator);
 }
 
 template<typename T>
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 1aa361b..a889ab1 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -124,13 +124,6 @@
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b);
-template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f};
-  return Packet2cf(padd(a.v, pxor(mask, b.v)));  
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 9cf4e07..2b48570 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3701,6 +3701,11 @@
   return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
 template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
 {
   return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index f6f1b8c..1cab374 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -19,7 +19,7 @@
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
-  __m128  v;
+  Packet4f v;
 };
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
@@ -66,12 +66,6 @@
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b);
-template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0));
-  return Packet2cf(padd(a.v, pxor(mask, b.v)));
-}
 
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
 {
@@ -113,19 +107,13 @@
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
-#if EIGEN_GNUC_AT_MOST(4,2)
-  // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
-  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
-#elif EIGEN_GNUC_AT_LEAST(4,6)
-  // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wuninitialized"
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-  #pragma GCC diagnostic pop
+#ifdef EIGEN_VECTORIZE_SSE3
+  res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<double const*>(&from)));
 #else
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+  res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<double const*>(&from)));
+  res.v = _mm_movelh_ps(res.v, res.v);
 #endif
-  return Packet2cf(_mm_movelh_ps(res.v,res.v));
+  return res;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
@@ -252,7 +240,7 @@
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
-  __m128d  v;
+  Packet2d v;
 };
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 273dffb..efd7199 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -16,8 +16,8 @@
 //------------------------------------------------------------------------------------------
 
 #define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 4
-#define EIGEN_MINOR_VERSION 99
+#define EIGEN_MAJOR_VERSION 3
+#define EIGEN_MINOR_VERSION 90
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -688,8 +688,7 @@
 // Does the compiler support result_of?
 // result_of was deprecated in c++17 and removed in c++ 20
 #ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER >= 11 && \
-    (defined(__cplusplus) && __cplusplus >= 201103L && __cplusplus < 201703L)
+#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17
 #define EIGEN_HAS_STD_RESULT_OF 1
 #else
 #define EIGEN_HAS_STD_RESULT_OF 0
@@ -708,8 +707,7 @@
 #endif  // EIGEN_HAS_STD_HASH
 
 #ifndef EIGEN_HAS_STD_INVOKE_RESULT
-#if EIGEN_MAX_CPP_VER >= 17 && \
-    (defined(__cplusplus) && __cplusplus >= 201703L)
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
 #define EIGEN_HAS_STD_INVOKE_RESULT 1
 #else
 #define EIGEN_HAS_STD_INVOKE_RESULT 0
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 7cbe8a6..875318c 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -566,6 +566,17 @@
   }
 };
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::move(start, end, target);
+}
+#else
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::copy(start, end, target);
+}
+#endif
 
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 9c77717..2c63a95 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -136,15 +136,14 @@
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
     operator T() const { return T(Value); }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    void setValue(T) const {}
+    void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
 {
     T m_value;
-    EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
   public:
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index 5a8d0c1..ee5548a 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -54,10 +54,12 @@
   {
     ActualMatrixType matrix(mat);
 
-    Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
-    Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
-    Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
-    Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
+    const float* data = matrix.data();
+    const Index stride = matrix.innerStride();
+    Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
+    Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
+    Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
+    Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
 
     // Four 2x2 sub-matrices of the input matrix
     // input = [[A, B],
@@ -189,25 +191,26 @@
 
     Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
 
+    const double* data = matrix.data();
+    const Index stride = matrix.innerStride();
     if (StorageOrdersMatch)
     {
-      A1 = matrix.template packet<MatrixAlignment>(0);
-      B1 = matrix.template packet<MatrixAlignment>(2);
-      A2 = matrix.template packet<MatrixAlignment>(4);
-      B2 = matrix.template packet<MatrixAlignment>(6);
-      C1 = matrix.template packet<MatrixAlignment>(8);
-      D1 = matrix.template packet<MatrixAlignment>(10);
-      C2 = matrix.template packet<MatrixAlignment>(12);
-      D2 = matrix.template packet<MatrixAlignment>(14);
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
     }
     else
     {
       Packet2d temp;
-      A1 = matrix.template packet<MatrixAlignment>(0);
-      C1 = matrix.template packet<MatrixAlignment>(2);
-      A2 = matrix.template packet<MatrixAlignment>(4);
-      C2 = matrix.template packet<MatrixAlignment>(6);
-
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
       temp = A1;
       A1 = vec2d_unpacklo(A1, A2);
       A2 = vec2d_unpackhi(temp, A2);
@@ -216,10 +219,10 @@
       C1 = vec2d_unpacklo(C1, C2);
       C2 = vec2d_unpackhi(temp, C2);
 
-      B1 = matrix.template packet<MatrixAlignment>(8);
-      D1 = matrix.template packet<MatrixAlignment>(10);
-      B2 = matrix.template packet<MatrixAlignment>(12);
-      D2 = matrix.template packet<MatrixAlignment>(14);
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
 
       temp = B1;
       B1 = vec2d_unpacklo(B1, B2);
diff --git a/bench/basicbenchmark.h b/bench/basicbenchmark.h
index 3fdc357..8059375 100644
--- a/bench/basicbenchmark.h
+++ b/bench/basicbenchmark.h
@@ -16,13 +16,13 @@
     {
       asm("#begin_bench_loop LazyEval");
       if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
-      m = (I + 0.00005 * (m + m.lazy() * m)).eval();
+      m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
     }
     else if (Mode==OmpEval)
     {
       asm("#begin_bench_loop OmpEval");
       if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
-      m = (I + 0.00005 * (m + m.lazy() * m)).evalOMP();
+      m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
     }
     else
     {
diff --git a/bench/sparse_randomsetter.cpp b/bench/sparse_randomsetter.cpp
index 19a76e3..c433742 100644
--- a/bench/sparse_randomsetter.cpp
+++ b/bench/sparse_randomsetter.cpp
@@ -1,6 +1,7 @@
 
 #define NOGMM
 #define NOMTL
+#define EIGEN_GOOGLEHASH_SUPPORT 1
 
 #include <map>
 #include <ext/hash_map>
diff --git a/test/SafeScalar.h b/test/SafeScalar.h
new file mode 100644
index 0000000..c5cb757
--- /dev/null
+++ b/test/SafeScalar.h
@@ -0,0 +1,30 @@
+
+// A Scalar that asserts for uninitialized access.
+template<typename T>
+class SafeScalar {
+ public:
+  SafeScalar() : initialized_(false) {}
+  SafeScalar(const SafeScalar& other) {
+    *this = other;
+  }
+  SafeScalar& operator=(const SafeScalar& other) {
+    val_ = T(other);
+    initialized_ = true;
+    return *this;
+  }
+  
+  SafeScalar(T val) : val_(val), initialized_(true) {}
+  SafeScalar& operator=(T val) {
+    val_ = val;
+    initialized_ = true;
+  }
+  
+  operator T() const {
+    VERIFY(initialized_ && "Uninitialized access.");
+    return val_;
+  }
+ 
+ private:
+  T val_;
+  bool initialized_;
+};
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
index 7fa2585..36ccbb0 100644
--- a/test/dense_storage.cpp
+++ b/test/dense_storage.cpp
@@ -8,17 +8,16 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "AnnoyingScalar.h"
+#include "SafeScalar.h"
 
 #include <Eigen/Core>
 
-template <typename T, int Rows, int Cols>
-void dense_storage_copy()
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_copy(int rows, int cols)
 {
-  static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols);
-  typedef DenseStorage<T,Size, Rows,Cols, 0> DenseStorageType;
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
   
-  const int rows = (Rows==Dynamic) ? 4 : Rows;
-  const int cols = (Cols==Dynamic) ? 3 : Cols;
   const int size = rows*cols;
   DenseStorageType reference(size, rows, cols);
   T* raw_reference = reference.data();
@@ -31,14 +30,11 @@
     VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
 }
 
-template <typename T, int Rows, int Cols>
-void dense_storage_assignment()
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_assignment(int rows, int cols)
 {
-  static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols);
-  typedef DenseStorage<T,Size, Rows,Cols, 0> DenseStorageType;
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
   
-  const int rows = (Rows==Dynamic) ? 4 : Rows;
-  const int cols = (Cols==Dynamic) ? 3 : Cols;
   const int size = rows*cols;
   DenseStorageType reference(size, rows, cols);
   T* raw_reference = reference.data();
@@ -52,6 +48,34 @@
     VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
 }
 
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_swap(int rows0, int cols0, int rows1, int cols1)
+{
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
+  
+  const int size0 = rows0*cols0;
+  DenseStorageType a(size0, rows0, cols0);
+  for (int i=0; i<size0; ++i) {
+    a.data()[i] = static_cast<T>(i);
+  }
+  
+  const int size1 = rows1*cols1;
+  DenseStorageType b(size1, rows1, cols1);
+  for (int i=0; i<size1; ++i) {
+    b.data()[i] = static_cast<T>(-i);
+  }
+  
+  a.swap(b);
+  
+  for (int i=0; i<size0; ++i) {
+    VERIFY_IS_EQUAL(b.data()[i], static_cast<T>(i));
+  }
+  
+  for (int i=0; i<size1; ++i) {
+    VERIFY_IS_EQUAL(a.data()[i], static_cast<T>(-i));
+  }
+}
+
 template<typename T, int Size, std::size_t Alignment>
 void dense_storage_alignment()
 {
@@ -78,30 +102,78 @@
   #endif
 }
 
+template<typename T>
+void dense_storage_tests() {
+  // Dynamic Storage.
+  dense_storage_copy<T,Dynamic,Dynamic,Dynamic>(4, 3);  
+  dense_storage_copy<T,Dynamic,Dynamic,3>(4, 3);
+  dense_storage_copy<T,Dynamic,4,Dynamic>(4, 3);
+  // Fixed Storage.
+  dense_storage_copy<T,12,4,3>(4, 3);
+  dense_storage_copy<T,12,Dynamic,Dynamic>(4, 3);
+  dense_storage_copy<T,12,4,Dynamic>(4, 3);
+  dense_storage_copy<T,12,Dynamic,3>(4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_copy<T,18,Dynamic,Dynamic>(4, 3);
+  dense_storage_copy<T,18,4,Dynamic>(4, 3);
+  dense_storage_copy<T,18,Dynamic,3>(4, 3);
+  
+  // Dynamic Storage.
+  dense_storage_assignment<T,Dynamic,Dynamic,Dynamic>(4, 3);  
+  dense_storage_assignment<T,Dynamic,Dynamic,3>(4, 3);
+  dense_storage_assignment<T,Dynamic,4,Dynamic>(4, 3);
+  // Fixed Storage.
+  dense_storage_assignment<T,12,4,3>(4, 3);
+  dense_storage_assignment<T,12,Dynamic,Dynamic>(4, 3);
+  dense_storage_assignment<T,12,4,Dynamic>(4, 3);
+  dense_storage_assignment<T,12,Dynamic,3>(4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_assignment<T,18,Dynamic,Dynamic>(4, 3);
+  dense_storage_assignment<T,18,4,Dynamic>(4, 3);
+  dense_storage_assignment<T,18,Dynamic,3>(4, 3);
+  
+  // Dynamic Storage.
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(4, 3, 4, 3); 
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(4, 3, 2, 1);  
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(2, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 1, 4, 3);
+  // Fixed Storage.
+  dense_storage_swap<T,12,4,3>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(4, 3, 2, 1);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,12,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,12,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,12,4,Dynamic>(4, 1, 4, 3);
+  dense_storage_swap<T,12,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,12,Dynamic,3>(2, 3, 4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_swap<T,18,Dynamic,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,18,Dynamic,Dynamic>(4, 3, 2, 1);
+  dense_storage_swap<T,18,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,18,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,18,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,18,4,Dynamic>(4, 1, 4, 3);
+  dense_storage_swap<T,18,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,18,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,18,Dynamic,3>(2, 3, 4, 3);
+  
+  dense_storage_alignment<T,16,8>();
+  dense_storage_alignment<T,16,16>();
+  dense_storage_alignment<T,16,32>();
+  dense_storage_alignment<T,16,64>();
+}
+
 EIGEN_DECLARE_TEST(dense_storage)
 {
-  dense_storage_copy<int,Dynamic,Dynamic>();  
-  dense_storage_copy<int,Dynamic,3>();
-  dense_storage_copy<int,4,Dynamic>();
-  dense_storage_copy<int,4,3>();
-
-  dense_storage_copy<float,Dynamic,Dynamic>();
-  dense_storage_copy<float,Dynamic,3>();
-  dense_storage_copy<float,4,Dynamic>();  
-  dense_storage_copy<float,4,3>();
-  
-  dense_storage_assignment<int,Dynamic,Dynamic>();  
-  dense_storage_assignment<int,Dynamic,3>();
-  dense_storage_assignment<int,4,Dynamic>();
-  dense_storage_assignment<int,4,3>();
-
-  dense_storage_assignment<float,Dynamic,Dynamic>();
-  dense_storage_assignment<float,Dynamic,3>();
-  dense_storage_assignment<float,4,Dynamic>();  
-  dense_storage_assignment<float,4,3>(); 
-
-  dense_storage_alignment<float,16,8>();
-  dense_storage_alignment<float,16,16>();
-  dense_storage_alignment<float,16,32>();
-  dense_storage_alignment<float,16,64>();
+  dense_storage_tests<int>();
+  dense_storage_tests<float>();
+  dense_storage_tests<SafeScalar<float> >();
+  dense_storage_tests<AnnoyingScalar>();
 }
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 65b80c3..0fb2f4d 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -234,15 +234,21 @@
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
+
     // trivial test for 1x1 matrices:
     CALL_SUBTEST_1( selfadjointeigensolver(Matrix<float, 1, 1>()));
     CALL_SUBTEST_1( selfadjointeigensolver(Matrix<double, 1, 1>()));
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<std::complex<double>, 1, 1>()));
+
     // very important to test 3x3 and 2x2 matrices since we provide special paths for them
     CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) );
     CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) );
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2cd()) );
     CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) );
     CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3cd()) );
     CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) );
+    CALL_SUBTEST_2( selfadjointeigensolver(Matrix4cd()) );
     
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) );
@@ -254,6 +260,8 @@
     // some trivial but implementation-wise tricky cases
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) );
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(2,2)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(1,1)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(2,2)) );
     CALL_SUBTEST_6( selfadjointeigensolver(Matrix<double,1,1>()) );
     CALL_SUBTEST_7( selfadjointeigensolver(Matrix<double,2,2>()) );
   }
diff --git a/test/numext.cpp b/test/numext.cpp
index cf1ca17..8a2fde5 100644
--- a/test/numext.cpp
+++ b/test/numext.cpp
@@ -62,6 +62,20 @@
 }
 
 template<typename T>
+void check_arg() {
+  typedef typename NumTraits<T>::Real Real;
+  VERIFY_IS_EQUAL(numext::abs(T(0)), T(0));
+  VERIFY_IS_EQUAL(numext::abs(T(1)), T(1));
+
+  for(int k=0; k<100; ++k)
+  {
+    T x = internal::random<T>();
+    Real y = numext::arg(x);
+    VERIFY_IS_APPROX( y, std::arg(x) );
+  }
+}
+
+template<typename T>
 struct check_sqrt_impl {
   static void run() {
     for (int i=0; i<1000; ++i) {
@@ -242,10 +256,12 @@
     CALL_SUBTEST( check_abs<float>() );
     CALL_SUBTEST( check_abs<double>() );
     CALL_SUBTEST( check_abs<long double>() );
-
     CALL_SUBTEST( check_abs<std::complex<float> >() );
     CALL_SUBTEST( check_abs<std::complex<double> >() );
 
+    CALL_SUBTEST( check_arg<std::complex<float> >() );
+    CALL_SUBTEST( check_arg<std::complex<double> >() );
+
     CALL_SUBTEST( check_sqrt<float>() );
     CALL_SUBTEST( check_sqrt<double>() );
     CALL_SUBTEST( check_sqrt<std::complex<float> >() );
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 67d329a..0bb511d 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -279,10 +279,75 @@
   CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
 }
 
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops_real() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+
+  //Test (-0) <=/< (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops_notcomplex() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+
+  //Test (-0) <=/< (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+}
+
 // Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path
 // (for some compilers) compute the bitwise and with 0x1 of the results to keep the value in [0,1].
 template<>
 void packetmath_boolean_mask_ops<bool, internal::packet_traits<bool>::type>() {}
+template<>
+void packetmath_boolean_mask_ops_notcomplex<bool, internal::packet_traits<bool>::type>() {}
 
 template <typename Scalar, typename Packet>
 void packetmath_minus_zero_add() {
@@ -574,6 +639,8 @@
   CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
   CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
   CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
+
+  packetmath_boolean_mask_ops_real<Scalar,Packet>();
   
   // Rounding edge cases.
   if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
@@ -1020,6 +1087,8 @@
     CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin<PropagateNaN>));
     CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax<PropagateNaN>);
   }
+
+  packetmath_boolean_mask_ops_notcomplex<Scalar, Packet>();
 }
 
 template <typename Scalar, typename Packet, bool ConjLhs, bool ConjRhs>
diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp
index c20a32f..2c9999c 100644
--- a/test/rvalue_types.cpp
+++ b/test/rvalue_types.cpp
@@ -13,41 +13,12 @@
 #if EIGEN_HAS_CXX11
 #include "MovableScalar.h"
 #endif
+#include "SafeScalar.h"
 
 #include <Eigen/Core>
 
 using internal::UIntPtr;
 
-// A Scalar that asserts for uninitialized access.
-template<typename T>
-class SafeScalar {
- public:
-  SafeScalar() : initialized_(false) {}
-  SafeScalar(const SafeScalar& other) {
-    *this = other;
-  }
-  SafeScalar& operator=(const SafeScalar& other) {
-    val_ = T(other);
-    initialized_ = true;
-    return *this;
-  }
-  
-  SafeScalar(T val) : val_(val), initialized_(true) {}
-  SafeScalar& operator=(T val) {
-    val_ = val;
-    initialized_ = true;
-  }
-  
-  operator T() const {
-    VERIFY(initialized_ && "Uninitialized access.");
-    return val_;
-  }
- 
- private:
-  T val_;
-  bool initialized_;
-};
-
 #if EIGEN_HAS_RVALUE_REFERENCES
 template <typename MatrixType>
 void rvalue_copyassign(const MatrixType& m)
diff --git a/test/sparse.h b/test/sparse.h
index df471b4..6cd07fc 100644
--- a/test/sparse.h
+++ b/test/sparse.h
@@ -29,10 +29,6 @@
 
 #endif
 
-#ifdef EIGEN_GOOGLEHASH_SUPPORT
-  #include <google/sparse_hash_map>
-#endif
-
 #include <Eigen/Cholesky>
 #include <Eigen/LU>
 #include <Eigen/Sparse>
diff --git a/test/stl_iterators.cpp b/test/stl_iterators.cpp
index 997f801..72bbf82 100644
--- a/test/stl_iterators.cpp
+++ b/test/stl_iterators.cpp
@@ -7,9 +7,9 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include "main.h"
 #include <iterator>
 #include <numeric>
-#include "main.h"
 
 template< class Iterator >
 std::reverse_iterator<Iterator>
@@ -47,6 +47,18 @@
 template<typename XprType>
 bool is_generic_randaccess_stl_iterator(const internal::generic_randaccess_stl_iterator<XprType> &) { return true; }
 
+template<typename Iter>
+bool is_default_constructible_and_assignable(const Iter& it)
+{
+#if EIGEN_HAS_CXX11
+  VERIFY(std::is_default_constructible<Iter>::value);
+  VERIFY(std::is_nothrow_default_constructible<Iter>::value);
+#endif
+  Iter it2;
+  it2 = it;
+  return (it==it2);
+}
+
 template<typename Xpr>
 void check_begin_end_for_loop(Xpr xpr)
 {
@@ -124,6 +136,22 @@
   
   Index i, j;
 
+  // Verify that iterators are default constructible (See bug #1900)
+  {
+    VERIFY( is_default_constructible_and_assignable(v.begin()));
+    VERIFY( is_default_constructible_and_assignable(v.end()));
+    VERIFY( is_default_constructible_and_assignable(cv.begin()));
+    VERIFY( is_default_constructible_and_assignable(cv.end()));
+
+    VERIFY( is_default_constructible_and_assignable(A.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(A.row(0).end()));
+    VERIFY( is_default_constructible_and_assignable(cA.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(cA.row(0).end()));
+
+    VERIFY( is_default_constructible_and_assignable(B.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(B.row(0).end()));
+  }
+
   // Check we got a fast pointer-based iterator when expected
   {
     VERIFY( is_pointer_based_stl_iterator(v.begin()) );
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index d73c600..0938bb5 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -41,14 +41,6 @@
 #include <random>
 #include <thread>
 
-#ifdef _WIN32
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
-
 #if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
 #include "ThreadPool"
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index 9422dcd..ec2e3cb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -42,51 +42,84 @@
   virtual unsigned int* semaphore() const = 0;
 };
 
-static gpuDeviceProp_t* m_deviceProperties;
-static bool m_devicePropInitialized = false;
+class GpuDeviceProperties {
+ public:
+  GpuDeviceProperties() : 
+      initialized_(false), first_(true), device_properties_(nullptr) {}
+ 
+  ~GpuDeviceProperties() {
+    if (device_properties_) {
+      delete[] device_properties_;
+    }
+  }
+  
+  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
+    return device_properties_[device];
+  }
 
-static void initializeDeviceProp() {
-  if (!m_devicePropInitialized) {
-    // Attempts to ensure proper behavior in the case of multiple threads
-    // calling this function simultaneously. This would be trivial to
-    // implement if we could use std::mutex, but unfortunately mutex don't
-    // compile with nvcc, so we resort to atomics and thread fences instead.
-    // Note that if the caller uses a compiler that doesn't support c++11 we
-    // can't ensure that the initialization is thread safe.
-    static std::atomic<bool> first(true);
-    if (first.exchange(false)) {
-      // We're the first thread to reach this point.
-      int num_devices;
-      gpuError_t status = gpuGetDeviceCount(&num_devices);
-      if (status != gpuSuccess) {
-        std::cerr << "Failed to get the number of GPU devices: "
-                  << gpuGetErrorString(status)
-                  << std::endl;
-        gpu_assert(status == gpuSuccess);
-      }
-      m_deviceProperties = new gpuDeviceProp_t[num_devices];
-      for (int i = 0; i < num_devices; ++i) {
-        status = gpuGetDeviceProperties(&m_deviceProperties[i], i);
+  EIGEN_STRONG_INLINE bool isInitialized() const {
+    return initialized_;
+  }
+
+  void initialize() {
+    if (!initialized_) {
+      // Attempts to ensure proper behavior in the case of multiple threads
+      // calling this function simultaneously. This would be trivial to
+      // implement if we could use std::mutex, but unfortunately mutex don't
+      // compile with nvcc, so we resort to atomics and thread fences instead.
+      // Note that if the caller uses a compiler that doesn't support c++11 we
+      // can't ensure that the initialization is thread safe.
+      if (first_.exchange(false)) {
+        // We're the first thread to reach this point.
+        int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
         if (status != gpuSuccess) {
-          std::cerr << "Failed to initialize GPU device #"
-                    << i
-                    << ": "
+          std::cerr << "Failed to get the number of GPU devices: "
                     << gpuGetErrorString(status)
                     << std::endl;
           gpu_assert(status == gpuSuccess);
         }
-      }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
+        for (int i = 0; i < num_devices; ++i) {
+          status = gpuGetDeviceProperties(&device_properties_[i], i);
+          if (status != gpuSuccess) {
+            std::cerr << "Failed to initialize GPU device #"
+                      << i
+                      << ": "
+                      << gpuGetErrorString(status)
+                      << std::endl;
+            gpu_assert(status == gpuSuccess);
+          }
+        }
 
-      std::atomic_thread_fence(std::memory_order_release);
-      m_devicePropInitialized = true;
-    } else {
-      // Wait for the other thread to inititialize the properties.
-      while (!m_devicePropInitialized) {
-        std::atomic_thread_fence(std::memory_order_acquire);
-        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        std::atomic_thread_fence(std::memory_order_release);
+        initialized_ = true;
+      } else {
+        // Wait for the other thread to inititialize the properties.
+        while (!initialized_) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        }
       }
     }
   }
+
+ private:
+  volatile bool initialized_;
+  std::atomic<bool> first_;
+  gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+  if (!deviceProperties->isInitialized()) {
+    deviceProperties->initialize();
+  }
+  return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+  return GetGpuDeviceProperties().get(device);
 }
 
 static const gpuStream_t default_stream = gpuStreamDefault;
@@ -96,12 +129,9 @@
   // Use the default stream on the current device
   GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     gpuGetDevice(&device_);
-    initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    initializeDeviceProp();
-  }
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
   // Use the specified stream. Note that it's the
   // caller responsibility to ensure that the stream can run on
   // the specified device. If no device is specified the code
@@ -118,7 +148,6 @@
       gpu_assert(device < num_devices);
       device_ = device;
     }
-    initializeDeviceProp();
   }
 
   virtual ~GpuStreamDevice() {
@@ -129,7 +158,7 @@
 
   const gpuStream_t& stream() const { return *stream_; }
   const gpuDeviceProp_t& deviceProperties() const {
-    return m_deviceProperties[device_];
+    return GetGpuDeviceProperties(device_);
   }
   virtual void* allocate(size_t num_bytes) const {
     gpuError_t err = gpuSetDevice(device_);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 132458a..f0f1e83 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -466,7 +466,7 @@
 template <typename Dims1, typename Dims2, ptrdiff_t n>
 struct sizes_match_below_dim<Dims1, Dims2, n, n> {
   static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
-    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
+    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
         sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 13450e1..37c1d1c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -21,47 +21,11 @@
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
   gpu_assert(threadIdx.z == 0);
-  return clock64() +
-      blockIdx.x * blockDim.x + threadIdx.x +
-      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
-
-#elif defined _WIN32
-  // Use the current time as a baseline.
-  SYSTEMTIME st;
-  GetSystemTime(&st);
-  int time = st.wSecond + 1000 * st.wMilliseconds;
-  // Mix in a random number to make sure that we get different seeds if
-  // we try to generate seeds faster than the clock resolution.
-  // We need 2 random values since the generator only generate 16 bits at
-  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
-  int rnd1 = ::rand();
-  int rnd2 = ::rand();
-  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
-  return rnd;
-
-#elif defined __APPLE__
-  // Same approach as for win32, except that the random number generator
-  // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
-  uint64_t rnd = ::random() ^ mach_absolute_time();
-  return rnd;
-
-#elif defined __native_client__
-  // Same approach as for win32, except using clock_gettime
-  timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
-  int rnd1 = ::rand();
-  int rnd2 = ::rand();
-  uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec;
-  return rnd;
-
+  return blockIdx.x * blockDim.x + threadIdx.x 
+         + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
 #else
-  // Augment the current time with pseudo random number generation
-  // to ensure that we get different seeds if we try to generate seeds
-  // faster than the clock resolution.
-  timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
-  uint64_t rnd = ::random() ^ ts.tv_nsec;
-  return rnd;
+  // Rely on Eigen's random implementation.
+  return random<uint64_t>();
 #endif
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 98c8250..a06c4a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -357,8 +357,8 @@
 
 }
 
-template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, GpuDevice, false> {
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
   void operator()(const Self& self, typename Self::CoeffReturnType* data) {
      Index total_size = internal::array_prod(self.dimensions());
      Index num_blocks = (total_size / self.size() + 63) / 64;
diff --git a/unsupported/Eigen/IterativeSolvers b/unsupported/Eigen/IterativeSolvers
index 8930a74..38e09fd 100644
--- a/unsupported/Eigen/IterativeSolvers
+++ b/unsupported/Eigen/IterativeSolvers
@@ -14,6 +14,7 @@
 #include "../../Eigen/Jacobi"
 #include "../../Eigen/Householder"
 
+
 /**
   * \defgroup IterativeLinearSolvers_Module Iterative solvers module
   * This module aims to provide various iterative linear and non linear solver algorithms.
@@ -23,11 +24,12 @@
   *  - an IDR(s) implementation
   *  - a DGMRES implementation
   *  - a MINRES implementation
+  *
   * \code
   * #include <unsupported/Eigen/IterativeSolvers>
   * \endcode
   */
-//@{
+
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
@@ -40,6 +42,5 @@
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-//@}
 
 #endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/unsupported/Eigen/SparseExtra b/unsupported/Eigen/SparseExtra
index 819cffa..ba5cbd6 100644
--- a/unsupported/Eigen/SparseExtra
+++ b/unsupported/Eigen/SparseExtra
@@ -24,6 +24,7 @@
 
 #ifdef EIGEN_GOOGLEHASH_SUPPORT
   #include <google/dense_hash_map>
+  #include <google/sparse_hash_map>
 #endif
 
 /**
diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index 7542cf7..985702b 100644
--- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -10,7 +10,13 @@
 #ifndef EIGEN_RANDOMSETTER_H
 #define EIGEN_RANDOMSETTER_H
 
-namespace Eigen { 
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+// Ensure the ::google namespace exists, required for checking existence of 
+// ::google::dense_hash_map and ::google::sparse_hash_map.
+namespace google {}
+#endif
+
+namespace Eigen {
 
 /** Represents a std::map
   *
@@ -56,7 +62,26 @@
 };
 #endif // EIGEN_UNORDERED_MAP_SUPPORT
 
-#ifdef _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+
+namespace google {
+  
+// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
+// are in the global namespace, and other times they are under ::google.
+using namespace ::google;
+
+template<typename KeyType, typename Scalar>
+struct DenseHashMap {
+  typedef dense_hash_map<KeyType, Scalar> type;
+};
+
+template<typename KeyType, typename Scalar>
+struct SparseHashMap {
+  typedef sparse_hash_map<KeyType, Scalar> type;
+};
+
+} // namespace google
+
 /** Represents a google::dense_hash_map
   *
   * \see RandomSetter
@@ -64,7 +89,7 @@
 template<typename Scalar> struct GoogleDenseHashMapTraits
 {
   typedef int KeyType;
-  typedef google::dense_hash_map<KeyType,Scalar> Type;
+  typedef typename google::DenseHashMap<KeyType,Scalar>::type Type;
   enum {
     IsSorted = 0
   };
@@ -72,9 +97,7 @@
   static void setInvalidKey(Type& map, const KeyType& k)
   { map.set_empty_key(k); }
 };
-#endif
 
-#ifdef _SPARSE_HASH_MAP_H_
 /** Represents a google::sparse_hash_map
   *
   * \see RandomSetter
@@ -82,7 +105,7 @@
 template<typename Scalar> struct GoogleSparseHashMapTraits
 {
   typedef int KeyType;
-  typedef google::sparse_hash_map<KeyType,Scalar> Type;
+  typedef typename google::SparseHashMap<KeyType,Scalar>::type Type;
   enum {
     IsSorted = 0
   };
@@ -134,18 +157,17 @@
   * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
   *
   * For performance and memory consumption reasons it is highly recommended to use one of
-  * the Google's hash_map implementation. To enable the support for them, you have two options:
-  *  - \#include <google/dense_hash_map> yourself \b before Eigen/Sparse header
-  *  - define EIGEN_GOOGLEHASH_SUPPORT
-  * In the later case the inclusion of <google/dense_hash_map> is made for you.
+  * Google's hash_map implementations. To enable the support for them, you must define
+  * EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
+  * <google/sparse_hash_map> for you.
   *
-  * \see http://code.google.com/p/google-sparsehash/
+  * \see https://github.com/sparsehash/sparsehash
   */
 template<typename SparseMatrixType,
          template <typename T> class MapTraits =
-#if defined _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
           GoogleDenseHashMapTraits
-#elif defined _HASH_MAP
+#elif defined(_HASH_MAP)
           GnuHashMapTraits
 #else
           StdMapTraits
diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index c55676c..062f76e 100644
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -444,7 +444,7 @@
       d_float, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
       d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
       d_res_half2, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
       d_res_float, num_elem);
@@ -461,7 +461,7 @@
   Tensor<float, 1> half_prec2(num_elem);
   Tensor<float, 1> full_prec(num_elem);
   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
   gpu_device.synchronize();
 
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index bc681e3..cdfd10c 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -123,10 +123,8 @@
     #ifdef EIGEN_UNORDERED_MAP_SUPPORT
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
     #endif
-    #ifdef _DENSE_HASH_MAP_H_
+    #ifdef EIGEN_GOOGLEHASH_SUPPORT
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
-    #endif
-    #ifdef _SPARSE_HASH_MAP_H_
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
     #endif