Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/352f1422d3ceea19a04cab297c6339e0870e1c6c

BEGIN_PUBLIC

Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/352f1422d3ceea19a04cab297c6339e0870e1c6c

END_PUBLIC

NOTE: This Eigen update changes the way transforms are decomposed into a
scale and a rotation.  Non-degenerate cases will show small numerical
differences, and degenerate cases (i.e. where one dimension is flipped) now
adjust the rotation to better preserve continuity.  In particular, this
affects the numeric results of the FaceNet model.

If your test is failing, please update your golden data.  If the cause does
not seem to be goldens-related, please reach out and we will help identify
and resolve the issue.
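
For illustration, a minimal sketch of the new behavior, mirroring the
regression test added below in test/geo_transformations.cpp (the epsilon and
the printed norm are illustrative only):

  #include <Eigen/Geometry>
  #include <iostream>

  int main() {
    using namespace Eigen;
    const double eps = 1e-3;
    Vector3d v0 = Vector3d::Random().normalized();
    Vector3d v1 = Vector3d::Random().normalized();
    Vector3d v2 = Vector3d::Random().normalized();
    // Two nearly rank-2 linear parts that differ by 2*eps; their
    // determinants can have opposite signs, which previously caused a
    // large jump in the recovered scaling.
    Matrix3d rank2 = 50 * v0 * v0.adjoint() + 20 * v1 * v1.adjoint();
    Affine3d t0, t1;
    t0.setIdentity();
    t1.setIdentity();
    t0.linear() = rank2 + eps * v2 * v2.adjoint();
    t1.linear() = rank2 - eps * v2 * v2.adjoint();
    Matrix3d r0, s0, r1, s1;
    t0.computeRotationScaling(&r0, &s0);
    t1.computeRotationScaling(&r1, &s1);
    // With this update, the scaling parts stay within O(eps) of each other.
    std::cout << (s0 - s1).norm() << std::endl;
    return 0;
  }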

PiperOrigin-RevId: 351646522
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index ff3134d..02b034d 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -64,23 +64,23 @@
     typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
     typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+    explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
       : m_matrix(mat), m_functor(func) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
 
     /** \returns the functor representing unary operation */
-    const ViewOp& functor() const { return m_functor; }
+    EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
-    const typename internal::remove_all<MatrixTypeNested>::type&
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    typename internal::remove_reference<MatrixTypeNested>::type&
+    EIGEN_DEVICE_FUNC typename internal::remove_reference<MatrixTypeNested>::type&
     nestedExpression() { return m_matrix; }
 
   protected:
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index bf7ef54..6906aa7 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -228,8 +228,7 @@
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     // make sure Dest is a compile-time vector type (bug 1166)
     typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
@@ -320,8 +319,7 @@
     typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 3cf91bd..f64116a 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -324,6 +324,43 @@
 };
 
 /****************************************************************************
+* Implementation of sqrt                                                 *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+// Complex sqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
+
+// MSVC incorrectly handles inf cases.
+#if EIGEN_COMP_MSVC > 0
+template<typename T>
+struct sqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_sqrt<T>(x);
+  }
+};
+#endif
+
+template<typename Scalar>
+struct sqrt_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
 
@@ -1368,12 +1405,11 @@
   *
   * It's usage is justified in performance critical functions, like norm/normalize.
   */
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T sqrt(const T &x)
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
 {
-  EIGEN_USING_STD(sqrt);
-  return sqrt(x);
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
 }
 
 // Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 8288ad8..9222285 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -99,6 +99,49 @@
   }
 };
 
+// Generic complex sqrt implementation that correctly handles corner cases
+// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
+  // Computes the principal sqrt of the input.
+  //
+  // For a complex square root of the number x + i*y. We want to find real
+  // numbers u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*y.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = y / (2 * u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = y / (2 * v)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w, v = sign(y) * w
+  //   if x > 0:  u = w, v = y / (2 * w)
+  //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+  const T cst_half = T(0.5);
+
+  // Special case of isinf(y)
+  if ((numext::isinf)(y)) {
+    return std::complex<T>(std::numeric_limits<T>::infinity(), y);
+  }
+
+  T w = numext::sqrt(cst_half * (numext::abs(x) + numext::abs(z)));
+  return
+    x == zero ? std::complex<T>(w, y < zero ? -w : w)
+    : x > zero ? std::complex<T>(w, y / (2 * w))
+      : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
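
For finite, non-zero inputs, the branch selection above reduces to a few
lines of scalar code; a hypothetical standalone helper (sqrt_branches is not
part of the patch, and the special-case and overflow handling are omitted):

  #include <cmath>
  #include <complex>

  // Principal square root of x + i*y via the equations in the comment
  // above: u^2 - v^2 = x and 2*u*v = y.
  std::complex<double> sqrt_branches(double x, double y) {
    const double w = std::sqrt(0.5 * (std::abs(x) + std::hypot(x, y)));
    if (x >= 0.0) {
      return {w, y / (2.0 * w)};            // u = w,          v = y / (2*u)
    }
    const double v = (y < 0.0) ? -w : w;    // v = sign(y) * w
    return {std::abs(y) / (2.0 * w), v};    // u = y / (2*v) = |y| / (2*w)
  }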
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 02b5843..079189a 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -441,8 +441,8 @@
     };
     // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
     //        this is important for real*complex_mat
-    Scalar actualAlpha =    blas_traits<Lhs>::extractScalarFactor(lhs)
-                          * blas_traits<Rhs>::extractScalarFactor(rhs);
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
+
     eval_dynamic_impl(dst,
                       blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
                       blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 172c8ff..9fca428 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -93,29 +93,127 @@
 
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
-  template<typename Expression>
-  EIGEN_DEVICE_FUNC void construct(Expression& expr)
-  {
-    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
+  // Resolves inner stride if default 0.
+  static EIGEN_DEVICE_FUNC Index resolveInnerStride(Index inner) {
+    if (inner == 0) {
+      return 1;
+    }
+    return inner;
+  }
+  
+  // Resolves outer stride if default 0.
+  static EIGEN_DEVICE_FUNC Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
+    if (outer == 0) {
+      if (isVectorAtCompileTime) {
+        outer = inner * rows * cols;
+      } else if (isRowMajor) {
+        outer = inner * cols;
+      } else {
+        outer = inner * rows;
+      }
+    }
+    return outer;
+  }
 
+  // Returns true if construction is valid, false if there is a stride mismatch,
+  // and fails if there is a size mismatch.
+  template<typename Expression>
+  EIGEN_DEVICE_FUNC bool construct(Expression& expr)
+  {
+    // Check matrix sizes.  If this is a compile-time vector, we do allow
+    // implicitly transposing.
+    EIGEN_STATIC_ASSERT(
+      EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression)
+      // If it is a vector, the transpose sizes might match.
+      || ( PlainObjectType::IsVectorAtCompileTime
+            && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::ColsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime))
+            &&  (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::RowsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))),
+      YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES
+    )
+
+    // Determine runtime rows and columns.
+    Index rows = expr.rows();
+    Index cols = expr.cols();
     if(PlainObjectType::RowsAtCompileTime==1)
     {
       eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+      rows = 1;
+      cols = expr.size();
     }
     else if(PlainObjectType::ColsAtCompileTime==1)
     {
       eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+      rows = expr.size();
+      cols = 1;
     }
-    else
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
+    // Verify that the sizes are valid.
+    eigen_assert(
+      (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
+    eigen_assert(
+      (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
+  
+  
+    // If this is a vector, we might be transposing, which means that stride should swap.
+    const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
+    // If the storage format differs, we also need to swap the stride.
+    const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0;
+    const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0;
+    const bool storage_differs =  (row_major != expr_row_major);
+
+    const bool swap_stride = (transpose != storage_differs);
     
-    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
-      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
-    else
-      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+    // Determine expr's actual strides, resolving any defaults if zero.
+    const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
+    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual, 
+                                                       expr.outerStride(),
+                                                       expr.rows(),
+                                                       expr.cols(), 
+                                                       Expression::IsVectorAtCompileTime != 0,
+                                                       expr_row_major);
+                                                       
+    // If this is a column-major row vector or row-major column vector, the inner-stride
+    // is arbitrary, so set it to either the compile-time inner stride or 1.
+    const bool row_vector = (rows == 1);
+    const bool col_vector = (cols == 1);
+    const Index inner_stride = 
+        ( (!row_major && row_vector) || (row_major && col_vector) ) ? 
+            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1) 
+            : swap_stride ? expr_outer_actual : expr_inner_actual;
+              
+    // If this is a column-major column vector or row-major row vector, the outer-stride
+    // is arbitrary, so set it to either the compile-time outer stride or vector size.
+    const Index outer_stride = 
+      ( (!row_major && col_vector) || (row_major && row_vector) ) ? 
+          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride) 
+          : swap_stride ? expr_inner_actual : expr_outer_actual;
+  
+    // Check if given inner/outer strides are compatible with compile-time strides.
+    const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic)
+        || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
+    if (!inner_valid) {
+      return false;
+    }
+    
+    const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic)
+        || (resolveOuterStride(
+              inner_stride, 
+              Index(StrideType::OuterStrideAtCompileTime),
+              rows, cols, PlainObjectType::IsVectorAtCompileTime != 0,
+              row_major)
+            == outer_stride);
+    if (!outer_valid) {
+      return false;
+    }
+    
+    ::new (static_cast<Base*>(this)) Base(expr.data(), rows, cols);
+    ::new (&m_stride) StrideBase(
+      (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
+      (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride );
+    return true;
   }
 
   StrideBase m_stride;
@@ -212,7 +310,10 @@
                                  typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
       EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      Base::construct(expr.derived());
+      // Construction must pass since we will not create temporary storage in the non-const case.
+      const bool success = Base::construct(expr.derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
     }
     template<typename Derived>
     EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
@@ -226,7 +327,10 @@
       EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
       EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
       EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      Base::construct(expr.const_cast_derived());
+      // Construction must pass since we will not create temporary storage in the non-const case.
+      const bool success = Base::construct(expr.const_cast_derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
     }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
@@ -267,7 +371,10 @@
     template<typename Expression>
     EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
     {
-      Base::construct(expr);
+      // Check if we can use the underlying expr's storage directly, otherwise call the copy version.
+      if (!Base::construct(expr)) {
+        construct(expr, internal::false_type());
+      }
     }
 
     template<typename Expression>
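
The net effect on the public API is easiest to see with the new fixed-size
vector cases added below in test/ref.cpp; a small sketch (shapes and values
are illustrative only):

  #include <Eigen/Core>
  using namespace Eigen;

  int main() {
    Matrix<float, 3, 5> m = Matrix<float, 3, 5>::Random();
    // Implicit transpose of a compile-time vector is now supported: a row
    // expression can initialize a Ref to a column-vector type.
    Ref<Matrix<float, 5, 1>, 0, Stride<Dynamic, Dynamic>> rt = m.row(1);
    // A const Ref without an explicit stride falls back to a copy when the
    // expression's runtime strides cannot match (construct() returns false).
    Ref<const Matrix<float, 5, 1>> crt = m.row(1);
    return rt.isApprox(crt) ? 0 : 1;
  }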
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 5fe2cff..c0d362f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -97,33 +97,36 @@
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psqrt<Packet8f>(const Packet8f& _x) {
-  Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
-  Packet8f denormal_mask = _mm256_and_ps(
-      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
-                    _CMP_LT_OQ),
-      _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
+  Packet8f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet8f x = _mm256_rsqrt_ps(_x);
   // Do a single step of Newton's iteration.
-  x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
+  return pandnot(pmul(_x,x), denormal_mask);
 }
+
 #else
+
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f psqrt<Packet8f>(const Packet8f& _x) {
   return _mm256_sqrt_ps(_x);
 }
+
 #endif
+
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4d psqrt<Packet4d>(const Packet4d& _x) {
   return _mm256_sqrt_pd(_x);
 }
-#if EIGEN_FAST_MATH
 
+#if EIGEN_FAST_MATH
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
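
The psqrt changes above only refactor the Newton step so it maps to a single
fused multiply-add; a scalar sketch of the algebra (rsqrt_newton_step is a
hypothetical helper, not part of the patch):

  #include <cmath>

  // One Newton iteration refining r ~= 1/sqrt(a):
  //   r' = r * (1.5 - 0.5 * a * r * r)
  // Rewriting the parenthesized term as fma(-0.5 * a, r * r, 1.5) lets the
  // vector code use a single pmadd instead of a separate pmul and psub.
  float rsqrt_newton_step(float a, float r) {
    const float minus_half_a = -0.5f * a;
    return r * std::fma(minus_half_a, r * r, 1.5f);
  }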
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 3974935..2299e4d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -149,7 +149,7 @@
     HasCeil   = 1,
     HasRint   = 1,
     HasBessel = 1,
-    HasNdtri  = 1,
+    HasNdtri  = 1
   };
 };
 
@@ -193,7 +193,7 @@
     HasCeil = 1,
     HasRint = 1,
     HasBessel = 1,
-    HasNdtri  = 1,
+    HasNdtri  = 1
   };
 };
 #endif
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index 57d1201..df5a3c2 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -12,12 +12,12 @@
 
 // clang-format off
 
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+
 namespace Eigen {
 
 namespace internal {
 
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
-
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
 // functions and thus Eigen functors making use of these operators fail to
@@ -94,10 +94,22 @@
 
 template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
 
+// Complex sqrt is already specialized on Windows.
+#if EIGEN_COMP_MSVC == 0
+template<typename T>
+struct sqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_sqrt<T>(x);
+  }
+};
 #endif
 
-} // end namespace internal
+}  // namespace internal
+}  // namespace Eigen
 
-} // end namespace Eigen
+#endif
 
-#endif // EIGEN_COMPLEX_CUDA_H
+#endif  // EIGEN_COMPLEX_CUDA_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index a6d2de6..9253d8ca 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -703,8 +703,8 @@
   //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
   //    v = 0.5 * (y / u)
   // and for x < 0,
-  //    v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2)))
-  //    u = |0.5 * (y / v)|
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
   //
   //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
   //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 47bca82..14f3dbd 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3273,26 +3273,26 @@
 // effective latency. This is similar to Quake3's fast inverse square root.
 // For more details see: http://www.beyond3d.com/content/articles/8/
 template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){
-  Packet4f half = vmulq_n_f32(_x, 0.5f);
+  Packet4f minus_half_x = vmulq_n_f32(_x, -0.5f);
   Packet4ui denormal_mask = vandq_u32(vcgeq_f32(_x, vdupq_n_f32(0.0f)),
                                       vcltq_f32(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
   // Compute approximate reciprocal sqrt.
   Packet4f x = vrsqrteq_f32(_x);
   // Do a single step of Newton's iteration.
   //the number 1.5f was set reference to Quake3's fast inverse square root
-  x = vmulq_f32(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x, x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x, x), pset1<Packet4f>(1.5f)));
   // Flush results for denormals to zero.
   return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(pmul(_x, x)), denormal_mask));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){
-  Packet2f half = vmul_n_f32(_x, 0.5f);
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x) {
+  Packet2f minus_half_x = vmul_n_f32(_x, -0.5f);
   Packet2ui denormal_mask = vand_u32(vcge_f32(_x, vdup_n_f32(0.0f)),
                                      vclt_f32(_x, pset1<Packet2f>((std::numeric_limits<float>::min)())));
   // Compute approximate reciprocal sqrt.
   Packet2f x = vrsqrte_f32(_x);
   // Do a single step of Newton's iteration.
-  x = vmul_f32(x, psub(pset1<Packet2f>(1.5f), pmul(half, pmul(x, x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x, x), pset1<Packet2f>(1.5f)));
   // Flush results for denormals to zero.
   return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(pmul(_x, x)), denormal_mask));
 }
@@ -3877,35 +3877,7 @@
 template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
 { return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
 
-#if EIGEN_FAST_MATH
-
-// Functions for sqrt support packet2d.
-// The EIGEN_FAST_MATH version uses the vrsqrte_f64 approximation and one step
-// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
-// exact solution. It does not handle +inf, or denormalized numbers correctly.
-// The main advantage of this approach is not just speed, but also the fact that
-// it can be inlined and pipelined with other computations, further reducing its
-// effective latency. This is similar to Quake3's fast inverse square root.
-// For more details see: http://www.beyond3d.com/content/articles/8/
-template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){
-  Packet2d half = vmulq_n_f64(_x, 0.5);
-  Packet2ul denormal_mask = vandq_u64(vcgeq_f64(_x, vdupq_n_f64(0.0)),
-                                      vcltq_f64(_x, pset1<Packet2d>((std::numeric_limits<double>::min)())));
-  // Compute approximate reciprocal sqrt.
-  Packet2d x = vrsqrteq_f64(_x);
-  // Do a single step of Newton's iteration.
-  //the number 1.5f was set reference to Quake3's fast inverse square root
-  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  // Do two more Newton's iterations to geta result accurate to 1 ulp.
-  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  // Flush results for denormals to zero.
-  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
-}
-
-#else
 template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
-#endif
 
 #endif // EIGEN_ARCH_ARM64
 
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 2d7df2f..9f66d8a 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -86,17 +86,17 @@
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
-  Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
-  Packet4f denormal_mask = _mm_and_ps(
-      _mm_cmpge_ps(_x, _mm_setzero_ps()),
-      _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
+  Packet4f minus_half_x = pmul(_x, pset1<Packet4f>(-0.5f));
+  Packet4f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet4f x = _mm_rsqrt_ps(_x);
   // Do a single step of Newton's iteration.
-  x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet4f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm_andnot_ps(denormal_mask, pmul(_x,x));
+  return pandnot(pmul(_x,x), denormal_mask);
 }
 
 #else
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 0d55bdf..caa65fc 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -489,8 +489,7 @@
     typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
-                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+    Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
 
     typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
             Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a90e574..c516102 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -618,6 +618,47 @@
   return extract_data_selector<T>::run(m);
 }
 
+/**
+ * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products.
+ * There is a specialization for booleans.
+ */
+template<typename ResScalar, typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha * blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+template<typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl<bool, Lhs, Rhs>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha && blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(alpha, lhs, rhs);
+}
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(lhs, rhs);
+}
+
+
 } // end namespace internal
 
 } // end namespace Eigen
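
The product paths below use this helper to fold all scalar factors into a
single alpha before calling the kernel; a sketch of the kind of expression it
targets (the matrices are illustrative only):

  #include <Eigen/Core>
  using namespace Eigen;

  int main() {
    MatrixXd A = MatrixXd::Random(4, 4), B = MatrixXd::Random(4, 4);
    MatrixXd C = MatrixXd::Zero(4, 4);
    // blas_traits strips the factors 2.0 and 3.0 from the nested product,
    // and combine_scalar_factors folds them into actualAlpha = 6.0, so the
    // GEMM kernel multiplies the bare A and B once.  For bool matrices the
    // specialization above combines factors with && instead of *.
    C.noalias() += (2.0 * A) * (3.0 * B);
    return C.allFinite() ? 0 : 1;
  }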
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 6343de4..6c05914 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -661,11 +661,11 @@
 #endif
 
 // Does the compiler support result_of?
-// It's likely that MSVC 2013 supports result_of but I couldn't not find a good source for that,
-// so let's be conservative.
+// result_of was deprecated in c++17 and removed in c++ 20
 #ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER>=11 && \
-    (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
+#if EIGEN_MAX_CPP_VER >= 11 && \
+    ((defined(__cplusplus) && __cplusplus >= 201103L && __cplusplus < 201703L) || \
+    __has_feature(cxx_lambdas))
 #define EIGEN_HAS_STD_RESULT_OF 1
 #else
 #define EIGEN_HAS_STD_RESULT_OF 0
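
For reference, the standard replacement once std::result_of is unavailable
(deprecated in C++17, removed in C++20) is std::invoke_result; a minimal
sketch:

  #include <type_traits>

  struct F { double operator()(int) const { return 0.0; } };

  // Pre-C++17: typedef std::result_of<F(int)>::type R;
  using R = std::invoke_result_t<F, int>;  // R is double
  static_assert(std::is_same<R, double>::value, "same deduced result type");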
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 13b1de8..7d258c0 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -1097,16 +1097,17 @@
 template<typename RotationMatrixType, typename ScalingMatrixType>
 EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
 {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
+  sv.coeffRef(Dim-1) *= x;
   if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
+    m.col(Dim-1) *= x;
     *rotation = m * svd.matrixV().adjoint();
   }
 }
@@ -1126,16 +1127,17 @@
 template<typename ScalingMatrixType, typename RotationMatrixType>
 EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
 {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
+  sv.coeffRef(Dim-1) *= x;
   if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
+    m.col(Dim-1) *= x;
     *rotation = m * svd.matrixV().adjoint();
   }
 }
@@ -1331,7 +1333,7 @@
 {
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     return T.matrix() * other;
   }
@@ -1349,7 +1351,7 @@
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
@@ -1375,7 +1377,7 @@
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
@@ -1400,7 +1402,7 @@
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index 0c50d5c..a85115b 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -137,16 +137,15 @@
   add_subdirectory(testing/MATGEN)
   add_subdirectory(testing/LIN)
   add_subdirectory(testing/EIG)
-  cmake_policy(SET CMP0026 OLD)
   macro(add_lapack_test output input target)
     set(TEST_INPUT "${LAPACK_SOURCE_DIR}/testing/${input}")
     set(TEST_OUTPUT "${LAPACK_BINARY_DIR}/TESTING/${output}")
-    get_target_property(TEST_LOC ${target} LOCATION)
     string(REPLACE "." "_" input_name ${input})
     set(testName "${target}_${input_name}")
     if(EXISTS "${TEST_INPUT}")
-      add_test(LAPACK-${testName} "${CMAKE_COMMAND}"
-        -DTEST=${TEST_LOC}
+      add_test(NAME LAPACK-${testName}
+        COMMAND "${CMAKE_COMMAND}"
+        -DTEST=$<TARGET_FILE:${target}>
         -DINPUT=${TEST_INPUT} 
         -DOUTPUT=${TEST_OUTPUT} 
         -DINTDIR=${CMAKE_CFG_INTDIR}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8eda8d2..ce37821 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -395,6 +395,12 @@
 if(CUDA_FOUND)
   
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+  
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
@@ -404,7 +410,12 @@
     foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
       string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
     endforeach()
+  else()
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
+    endforeach()
   endif()
+  string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
   
   ei_add_test(gpu_basic)
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index c722679..d433561 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -672,11 +672,39 @@
     VERIFY(t3.rotation().data()==t3.linear().data());
 }
 
+template<typename Scalar, int Mode, int Options> void transformations_computed_scaling_continuity()
+{
+  typedef Matrix<Scalar, 3, 1> Vector3;
+  typedef Transform<Scalar, 3, Mode, Options> Transform3;
+  typedef Matrix<Scalar, 3, 3> Matrix3;
+
+  // Given: two transforms that differ by '2*eps'.
+  Scalar eps(1e-3);
+  Vector3 v0 = Vector3::Random().normalized(),
+    v1 = Vector3::Random().normalized(),
+    v3 = Vector3::Random().normalized();
+  Transform3 t0, t1;
+  // The interesting case is when their determinants have different signs.
+  Matrix3 rank2 = 50 * v0 * v0.adjoint() + 20 * v1 * v1.adjoint();
+  t0.linear() = rank2 + eps * v3 * v3.adjoint();
+  t1.linear() = rank2 - eps * v3 * v3.adjoint();
+
+  // When: computing the rotation-scaling parts
+  Matrix3 r0, s0, r1, s1;
+  t0.computeRotationScaling(&r0, &s0);
+  t1.computeRotationScaling(&r1, &s1);
+
+  // Then: the scaling parts should differ by no more than '2*eps'.
+  const Scalar c(2.1); // 2 + room for rounding errors
+  VERIFY((s0 - s1).norm() < c * eps);
+}
+
 EIGEN_DECLARE_TEST(geo_transformations)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(( transformations<double,Affine,AutoAlign>() ));
     CALL_SUBTEST_1(( non_projective_only<double,Affine,AutoAlign>() ));
+    CALL_SUBTEST_1(( transformations_computed_scaling_continuity<double,Affine,AutoAlign>() ));   
     
     CALL_SUBTEST_2(( transformations<float,AffineCompact,AutoAlign>() ));
     CALL_SUBTEST_2(( non_projective_only<float,AffineCompact,AutoAlign>() ));
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index e8069f1..b82b94d 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -14,7 +14,6 @@
 #endif
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
 #include "main.h"
@@ -55,6 +54,59 @@
 };
 
 template<typename T>
+struct complex_sqrt {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_special_inputs = 18;
+    
+    if (i == 0) {
+      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+      SpecialInputs special_in;
+      special_in.setZero();
+      int idx = 0;
+      special_in[idx++] = ComplexType(0, 0);
+      special_in[idx++] = ComplexType(-0, 0);
+      special_in[idx++] = ComplexType(0, -0);
+      special_in[idx++] = ComplexType(-0, -0);
+      // GCC's fallback sqrt implementation fails for inf inputs.
+      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+      // clang includes the GCC header (which temporarily disables
+      // _GLIBCXX_USE_C99_COMPLEX)
+      #if !defined(_GLIBCXX_COMPLEX) || \
+        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+      const ValueType inf = std::numeric_limits<ValueType>::infinity();
+      special_in[idx++] = ComplexType(1.0, inf);
+      special_in[idx++] = ComplexType(nan, inf);
+      special_in[idx++] = ComplexType(1.0, -inf);
+      special_in[idx++] = ComplexType(nan, -inf);
+      special_in[idx++] = ComplexType(-inf, 1.0);
+      special_in[idx++] = ComplexType(inf, 1.0);
+      special_in[idx++] = ComplexType(-inf, -1.0);
+      special_in[idx++] = ComplexType(inf, -1.0);
+      special_in[idx++] = ComplexType(-inf, nan);
+      special_in[idx++] = ComplexType(inf, nan);
+      #endif
+      special_in[idx++] = ComplexType(1.0, nan);
+      special_in[idx++] = ComplexType(nan, 1.0);
+      special_in[idx++] = ComplexType(nan, -1.0);
+      special_in[idx++] = ComplexType(nan, nan);
+      
+      Map<SpecialInputs> special_out(out);
+      special_out = special_in.cwiseSqrt();
+    }
+    
+    T x1(in + i);
+    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+    res = x1.cwiseSqrt();
+  }
+};
+
+template<typename T>
 struct replicate {
   EIGEN_DEVICE_FUNC
   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
@@ -161,17 +213,58 @@
   }
 };
 
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  if (a.rows() != b.rows()) {
+    return false;
+  }
+  if (a.cols() != b.cols()) {
+    return false;
+  }
+  for (Index r = 0; r < a.rows(); ++r) {
+    for (Index c = 0; c < a.cols(); ++c) {
+      if (a(r, c) != b(r, c)
+          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) 
+          && !test_isApprox(a(r, c), b(r, c))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Output out_ref, out_gpu;
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  out_ref = out_gpu = out;
+  #else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+  #endif
+  run_on_cpu (ker, n, in,  out_ref);
+  run_on_gpu(ker, n, in, out_gpu);
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  verifyIsApproxWithInfsNans(out_ref, out_gpu);
+  #endif
+}
+
 EIGEN_DECLARE_TEST(gpu_basic)
 {
   ei_test_init_gpu();
   
   int nthreads = 100;
   Eigen::VectorXf in, out;
+  Eigen::VectorXcf cfin, cfout;
   
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   int data_size = nthreads * 512;
   in.setRandom(data_size);
-  out.setRandom(data_size);
+  out.setConstant(data_size, -1);
+  cfin.setRandom(data_size);
+  cfout.setConstant(data_size, -1);
   #endif
   
   CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
@@ -204,6 +297,8 @@
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
 
+  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
 #if defined(__NVCC__)
   // FIXME
   // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 049e7aa..fe0485e 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -68,8 +68,20 @@
 #else
   run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
 #endif
+  // Pre-launch errors.
+  gpuError_t err = gpuGetLastError();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
   
-  gpuDeviceSynchronize();
+  // Kernel execution errors.
+  err = gpuDeviceSynchronize();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+  
   
   // check inputs have not been modified
   gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()),  d_in,  in_bytes,  gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@
 {
   Input  in_ref,  in_gpu;
   Output out_ref, out_gpu;
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   in_ref = in_gpu = in;
   out_ref = out_gpu = out;
   #else
@@ -94,7 +106,7 @@
   #endif
   run_on_cpu (ker, n, in_ref,  out_ref);
   run_on_gpu(ker, n, in_gpu, out_gpu);
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   VERIFY_IS_APPROX(in_ref, in_gpu);
   VERIFY_IS_APPROX(out_ref, out_gpu);
   #endif
@@ -102,14 +114,16 @@
 
 struct compile_time_device_info {
   EIGEN_DEVICE_FUNC
-  void operator()(int /*i*/, const int* /*in*/, int* info) const
+  void operator()(int i, const int* /*in*/, int* info) const
   {
-    #if defined(__CUDA_ARCH__)
-    info[0] = int(__CUDA_ARCH__ +0);
-    #endif
-    #if defined(EIGEN_HIP_DEVICE_COMPILE)
-    info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
-    #endif
+    if (i == 0) {
+      #if defined(__CUDA_ARCH__)
+      info[0] = int(__CUDA_ARCH__ +0);
+      #endif
+      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+      info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+      #endif
+    }
   }
 };
 
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index f19d725..ab9bec1 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -933,7 +933,7 @@
     for (int i = 0; i < size; ++i) {
       data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
     }
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, size);
 
     // Test misc. corner cases.
     const RealScalar zero = RealScalar(0);
@@ -944,32 +944,32 @@
     data1[1] = Scalar(-zero, zero);
     data1[2] = Scalar(one, zero);
     data1[3] = Scalar(zero, one);
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
     data1[0] = Scalar(-one, zero);
     data1[1] = Scalar(zero, -one);
     data1[2] = Scalar(one, one);
     data1[3] = Scalar(-one, -one);
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
     data1[0] = Scalar(inf, zero);
     data1[1] = Scalar(zero, inf);
     data1[2] = Scalar(-inf, zero);
     data1[3] = Scalar(zero, -inf);
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
     data1[0] = Scalar(inf, inf);
     data1[1] = Scalar(-inf, inf);
     data1[2] = Scalar(inf, -inf);
     data1[3] = Scalar(-inf, -inf);
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
     data1[0] = Scalar(nan, zero);
     data1[1] = Scalar(zero, nan);
     data1[2] = Scalar(nan, one);
     data1[3] = Scalar(one, nan);
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
     data1[0] = Scalar(nan, nan);
     data1[1] = Scalar(inf, nan);
     data1[2] = Scalar(nan, inf);
     data1[3] = Scalar(-inf, nan);
-    CHECK_CWISE1(numext::sqrt, internal::psqrt);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
   }
 }
 
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
index f8dc371..46a4260 100644
--- a/test/packetmath_test_shared.h
+++ b/test/packetmath_test_shared.h
@@ -115,6 +115,17 @@
   VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
 }
 
+// Checks component-wise for input of size N. All of data1, data2, and ref
+// should have size at least ceil(N/PacketSize)*PacketSize to avoid memory
+// access errors.
+#define CHECK_CWISE1_N(REFOP, POP, N) { \
+  for (int i=0; i<N; ++i) \
+    ref[i] = REFOP(data1[i]); \
+  for (int j=0; j<N; j+=PacketSize) \
+    internal::pstore(data2 + j, POP(internal::pload<Packet>(data1 + j))); \
+  VERIFY(test::areApprox(ref, data2, N) && #POP); \
+}
+
 template<bool Cond,typename Packet>
 struct packet_helper
 {
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 93876db..1d6df6e 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -56,18 +56,17 @@
   VERIFY_IS_APPROX(C+=A.lazyProduct(B), ref_prod(D,A,B));
 }
 
-template<typename T>
-void test_dynamic_exact()
+void test_dynamic_bool()
 {
   int rows = internal::random<int>(1,64);
   int cols = internal::random<int>(1,64);
   int depth = internal::random<int>(1,65);
 
-  typedef Matrix<T,Dynamic,Dynamic> MatrixX;
+  typedef Matrix<bool,Dynamic,Dynamic> MatrixX;
   MatrixX A(rows,depth); A.setRandom();
   MatrixX B(depth,cols); B.setRandom();
-  MatrixX  C(rows,cols);  C.setRandom();
-  MatrixX  D(C);
+  MatrixX C(rows,cols);  C.setRandom();
+  MatrixX D(C);
   for(Index i=0;i<C.rows();++i)
     for(Index j=0;j<C.cols();++j)
       for(Index k=0;k<A.cols();++k)
@@ -317,7 +316,7 @@
     CALL_SUBTEST_6( bug_1311<3>() );
     CALL_SUBTEST_6( bug_1311<5>() );
 
-    CALL_SUBTEST_9( test_dynamic_exact<bool>() );
+    CALL_SUBTEST_9( test_dynamic_bool() );
   }
 
   CALL_SUBTEST_6( product_small_regressions<0>() );
diff --git a/test/rand.cpp b/test/rand.cpp
index 1b5c058..dd4e739 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -51,7 +51,7 @@
     Scalar r = check_in_range(x,y);
     hist( int((int64(r)-int64(x))/divisor) )++;
   }
-  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.02).all() );
+  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.025).all() );
 }
 
 EIGEN_DECLARE_TEST(rand)
diff --git a/test/ref.cpp b/test/ref.cpp
index c0b6ffd..ebfc70d 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -141,6 +141,69 @@
   VERIFY_IS_APPROX(mat1, mat2);
 }
 
+template<typename Scalar, int Rows, int Cols>
+void ref_vector_fixed_sizes()
+{
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMajorMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMajorMatrixType;
+  typedef Matrix<Scalar,1,Cols> RowVectorType;
+  typedef Matrix<Scalar,Rows,1> ColVectorType;
+  typedef Matrix<Scalar,Cols,1> RowVectorTransposeType;
+  typedef Matrix<Scalar,1,Rows> ColVectorTransposeType;
+  typedef Stride<Dynamic, Dynamic> DynamicStride;
+
+  RowMajorMatrixType mr = RowMajorMatrixType::Random();
+  ColMajorMatrixType mc = ColMajorMatrixType::Random();
+
+  Index i = internal::random<Index>(0,Rows-1);
+  Index j = internal::random<Index>(0,Cols-1);
+
+  // Reference ith row.
+  Ref<RowVectorType, 0, DynamicStride> mr_ri = mr.row(i);
+  VERIFY_IS_EQUAL(mr_ri, mr.row(i));
+  Ref<RowVectorType, 0, DynamicStride> mc_ri = mc.row(i);
+  VERIFY_IS_EQUAL(mc_ri, mc.row(i));
+
+  // Reference jth col.
+  Ref<ColVectorType, 0, DynamicStride> mr_cj = mr.col(j);
+  VERIFY_IS_EQUAL(mr_cj, mr.col(j));
+  Ref<ColVectorType, 0, DynamicStride> mc_cj = mc.col(j);
+  VERIFY_IS_EQUAL(mc_cj, mc.col(j));
+
+  // Reference the transpose of row i.
+  Ref<RowVectorTransposeType, 0, DynamicStride> mr_rit = mr.row(i);
+  VERIFY_IS_EQUAL(mr_rit, mr.row(i).transpose());
+  Ref<RowVectorTransposeType, 0, DynamicStride> mc_rit = mc.row(i);
+  VERIFY_IS_EQUAL(mc_rit, mc.row(i).transpose());
+
+  // Reference the transpose of col j.
+  Ref<ColVectorTransposeType, 0, DynamicStride> mr_cjt = mr.col(j);
+  VERIFY_IS_EQUAL(mr_cjt, mr.col(j).transpose());
+  Ref<ColVectorTransposeType, 0, DynamicStride> mc_cjt = mc.col(j);
+  VERIFY_IS_EQUAL(mc_cjt, mc.col(j).transpose());
+  
+  // Const references without strides.
+  Ref<const RowVectorType> cmr_ri = mr.row(i);
+  VERIFY_IS_EQUAL(cmr_ri, mr.row(i));
+  Ref<const RowVectorType> cmc_ri = mc.row(i);
+  VERIFY_IS_EQUAL(cmc_ri, mc.row(i));
+
+  Ref<const ColVectorType> cmr_cj = mr.col(j);
+  VERIFY_IS_EQUAL(cmr_cj, mr.col(j));
+  Ref<const ColVectorType> cmc_cj = mc.col(j);
+  VERIFY_IS_EQUAL(cmc_cj, mc.col(j));
+
+  Ref<const RowVectorTransposeType> cmr_rit = mr.row(i);
+  VERIFY_IS_EQUAL(cmr_rit, mr.row(i).transpose());
+  Ref<const RowVectorTransposeType> cmc_rit = mc.row(i);
+  VERIFY_IS_EQUAL(cmc_rit, mc.row(i).transpose());
+
+  Ref<const ColVectorTransposeType> cmr_cjt = mr.col(j);
+  VERIFY_IS_EQUAL(cmr_cjt, mr.col(j).transpose());
+  Ref<const ColVectorTransposeType> cmc_cjt = mc.col(j);
+  VERIFY_IS_EQUAL(cmc_cjt, mc.col(j).transpose());
+}
+
 template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
 {
   // verify that ref-to-const don't have LvalueBit
@@ -287,6 +350,9 @@
     CALL_SUBTEST_4( ref_matrix(Matrix<std::complex<double>,10,15>()) );
     CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random<int>(1,10),internal::random<int>(1,10))) );
     CALL_SUBTEST_6( call_ref() );
+
+    CALL_SUBTEST_8( (ref_vector_fixed_sizes<float,3,5>()) );
+    CALL_SUBTEST_8( (ref_vector_fixed_sizes<float,15,10>()) );
   }
   
   CALL_SUBTEST_7( test_ref_overloads() );
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index f32ce27..cb53ce2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -16,7 +16,7 @@
 // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
 // When compiling such files, gcc will end up trying to pick up the CUDA headers by 
 // default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
-// This will obsviously not work when trying to compile tensorflow on a system with no CUDA
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
 // To work around this issue for HIP systems (and leave the default behaviour intact), the
 // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and 
 // "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
@@ -30,6 +30,9 @@
 #define gpuSuccess hipSuccess
 #define gpuErrorNotReady hipErrorNotReady
 #define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
 #define gpuGetErrorString hipGetErrorString
 #define gpuGetDeviceProperties hipGetDeviceProperties
 #define gpuStreamDefault hipStreamDefault
@@ -57,6 +60,9 @@
 #define gpuSuccess cudaSuccess
 #define gpuErrorNotReady cudaErrorNotReady
 #define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
 #define gpuGetErrorString cudaGetErrorString
 #define gpuGetDeviceProperties cudaGetDeviceProperties
 #define gpuStreamDefault cudaStreamDefault
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index e6a666f..656fd21 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -45,8 +45,6 @@
   static const std::size_t MinSize = max_n_1<Size>::size;
   EIGEN_ALIGN_MAX T m_data[MinSize];
 
-  FixedDimensions m_dimensions;
-
  public:
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE TensorStorage() {
@@ -57,14 +55,17 @@
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T *data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
+  static EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
+  {
+    static const FixedDimensions* singleton_dimensions = new FixedDimensions();
+    return *singleton_dimensions;
+  }
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+  EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
 };
 
-
 // pure dynamic
 template<typename T, typename IndexType, int NumIndices_, int Options_>
 class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index cea4bbe..6bbcf50 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -321,18 +321,11 @@
     set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
   endif()
 
-  if(( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) AND EIGEN_TEST_CXX11)
-    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
-  else()
-    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
-    set(EIGEN_CUDA_CXX11_FLAG "")
-  endif()
-
   set(NVCC_ARCH_FLAGS)
   foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
     string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
   endforeach()
-  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")