Update Eigen to:
https://gitlab.com/libeigen/eigen/-/commit/5e89573e2a1018afa1e0ef6394f5f33851c3bc6f

PiperOrigin-RevId: 411940416
Change-Id: Ic805ea7145294ca1c0829069a40e29eadd99a70f
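
Note for downstream users: this revision removes MappedSparseMatrix (see the
Eigen/SparseCore and MappedSparseMatrix.h hunks below); per the deprecation
note in the deleted header, Map<SparseMatrix<...> > is the replacement, with
the same constructor arguments. A minimal migration sketch, with illustrative
data, assuming a compressed column-major matrix with int indices:

    #include <Eigen/SparseCore>

    int main() {
      // 2x2 identity in compressed column storage (illustrative buffers;
      // they must outlive the map).
      int    outer[]  = {0, 1, 2};   // column start pointers
      int    inner[]  = {0, 1};      // row indices
      double values[] = {1.0, 1.0};

      // Before this update:
      //   Eigen::MappedSparseMatrix<double, Eigen::ColMajor, int>
      //       m(2, 2, 2, outer, inner, values);
      // After this update:
      Eigen::Map<Eigen::SparseMatrix<double, Eigen::ColMajor, int> >
          m(2, 2, 2, outer, inner, values);
      return m.nonZeros() == 2 ? 0 : 1;
    }
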
diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index 76966c4..b2db46b 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -41,7 +41,6 @@
 #include "src/SparseCore/SparseCompressedBase.h"
 #include "src/SparseCore/SparseMatrix.h"
 #include "src/SparseCore/SparseMap.h"
-#include "src/SparseCore/MappedSparseMatrix.h"
 #include "src/SparseCore/SparseVector.h"
 #include "src/SparseCore/SparseRef.h"
 #include "src/SparseCore/SparseCwiseUnaryOp.h"
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index a56ebfd..d33af54 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -157,9 +157,9 @@
 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
   * The data are not copied but shared. */
 template<typename Scalar, int Flags, typename StorageIndex>
-MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
+Map<SparseMatrix<Scalar,Flags,StorageIndex> > viewAsEigen(cholmod_sparse& cm)
 {
-  return MappedSparseMatrix<Scalar,Flags,StorageIndex>
+  return Map<SparseMatrix<Scalar,Flags,StorageIndex> >
          (cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
           static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index eb8d40a..36df320 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -130,39 +130,39 @@
 namespace std {
 template<>
 struct numeric_limits<Eigen::bfloat16> {
-  static const bool is_specialized = true;
-  static const bool is_signed = true;
-  static const bool is_integer = false;
-  static const bool is_exact = false;
-  static const bool has_infinity = true;
-  static const bool has_quiet_NaN = true;
-  static const bool has_signaling_NaN = true;
-  static const float_denorm_style has_denorm = std::denorm_absent;
-  static const bool has_denorm_loss = false;
-  static const std::float_round_style round_style = numeric_limits<float>::round_style;
-  static const bool is_iec559 = false;
-  static const bool is_bounded = true;
-  static const bool is_modulo = false;
-  static const int digits = 8;
-  static const int digits10 = 2;
-  static const int max_digits10 = 4;
-  static const int radix = 2;
-  static const int min_exponent = numeric_limits<float>::min_exponent;
-  static const int min_exponent10 = numeric_limits<float>::min_exponent10;
-  static const int max_exponent = numeric_limits<float>::max_exponent;
-  static const int max_exponent10 = numeric_limits<float>::max_exponent10;
-  static const bool traps = numeric_limits<float>::traps;
-  static const bool tinyness_before = numeric_limits<float>::tinyness_before;
+  static EIGEN_CONSTEXPR const bool is_specialized = true;
+  static EIGEN_CONSTEXPR const bool is_signed = true;
+  static EIGEN_CONSTEXPR const bool is_integer = false;
+  static EIGEN_CONSTEXPR const bool is_exact = false;
+  static EIGEN_CONSTEXPR const bool has_infinity = true;
+  static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
+  static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
+  static EIGEN_CONSTEXPR const float_denorm_style has_denorm = std::denorm_absent;
+  static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
+  static EIGEN_CONSTEXPR const std::float_round_style round_style = numeric_limits<float>::round_style;
+  static EIGEN_CONSTEXPR const bool is_iec559 = false;
+  static EIGEN_CONSTEXPR const bool is_bounded = true;
+  static EIGEN_CONSTEXPR const bool is_modulo = false;
+  static EIGEN_CONSTEXPR const int digits = 8;
+  static EIGEN_CONSTEXPR const int digits10 = 2;
+  static EIGEN_CONSTEXPR const int max_digits10 = 4;
+  static EIGEN_CONSTEXPR const int radix = 2;
+  static EIGEN_CONSTEXPR const int min_exponent = numeric_limits<float>::min_exponent;
+  static EIGEN_CONSTEXPR const int min_exponent10 = numeric_limits<float>::min_exponent10;
+  static EIGEN_CONSTEXPR const int max_exponent = numeric_limits<float>::max_exponent;
+  static EIGEN_CONSTEXPR const int max_exponent10 = numeric_limits<float>::max_exponent10;
+  static EIGEN_CONSTEXPR const bool traps = numeric_limits<float>::traps;
+  static EIGEN_CONSTEXPR const bool tinyness_before = numeric_limits<float>::tinyness_before;
 
-  static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
-  static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
-  static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
-  static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
-  static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); }
-  static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
-  static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
-  static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); }
-  static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
 };
 
 // If std::numeric_limits<T> is specialized, should also specialize
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index df82612..4155335 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -208,39 +208,39 @@
 namespace std {
 template<>
 struct numeric_limits<Eigen::half> {
-  static const bool is_specialized = true;
-  static const bool is_signed = true;
-  static const bool is_integer = false;
-  static const bool is_exact = false;
-  static const bool has_infinity = true;
-  static const bool has_quiet_NaN = true;
-  static const bool has_signaling_NaN = true;
-  static const float_denorm_style has_denorm = denorm_present;
-  static const bool has_denorm_loss = false;
-  static const std::float_round_style round_style = std::round_to_nearest;
-  static const bool is_iec559 = false;
-  static const bool is_bounded = false;
-  static const bool is_modulo = false;
-  static const int digits = 11;
-  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static const int radix = 2;
-  static const int min_exponent = -13;
-  static const int min_exponent10 = -4;
-  static const int max_exponent = 16;
-  static const int max_exponent10 = 4;
-  static const bool traps = true;
-  static const bool tinyness_before = false;
+  static EIGEN_CONSTEXPR const bool is_specialized = true;
+  static EIGEN_CONSTEXPR const bool is_signed = true;
+  static EIGEN_CONSTEXPR const bool is_integer = false;
+  static EIGEN_CONSTEXPR const bool is_exact = false;
+  static EIGEN_CONSTEXPR const bool has_infinity = true;
+  static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
+  static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
+  static EIGEN_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
+  static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
+  static EIGEN_CONSTEXPR const std::float_round_style round_style = std::round_to_nearest;
+  static EIGEN_CONSTEXPR const bool is_iec559 = false;
+  static EIGEN_CONSTEXPR const bool is_bounded = false;
+  static EIGEN_CONSTEXPR const bool is_modulo = false;
+  static EIGEN_CONSTEXPR const int digits = 11;
+  static EIGEN_CONSTEXPR const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static EIGEN_CONSTEXPR const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static EIGEN_CONSTEXPR const int radix = 2;
+  static EIGEN_CONSTEXPR const int min_exponent = -13;
+  static EIGEN_CONSTEXPR const int min_exponent10 = -4;
+  static EIGEN_CONSTEXPR const int max_exponent = 16;
+  static EIGEN_CONSTEXPR const int max_exponent10 = 4;
+  static EIGEN_CONSTEXPR const bool traps = true;
+  static EIGEN_CONSTEXPR const bool tinyness_before = false;
 
-  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
-  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
-  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
-  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
-  static Eigen::half round_error() { return Eigen::half(0.5); }
-  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
-  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
-  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
-  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+  static EIGEN_CONSTEXPR Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+  static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static EIGEN_CONSTEXPR Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+  static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
+  static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static EIGEN_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static EIGEN_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+  static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
 };
 
 // If std::numeric_limits<T> is specialized, should also specialize
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 2e5e731..dd6d451 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -438,11 +438,11 @@
   #include <arm_fp16.h>
 #endif
 
-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380))
+#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
   // We can use the optimized fp16 to float and float to fp16 conversion routines
   #define EIGEN_HAS_FP16_C
 
-  #if defined(EIGEN_COMP_CLANG)
+  #if EIGEN_COMP_CLANG
     // Workaround for clang: The FP16C intrinsics for clang are included by
     // immintrin.h, as opposed to emmintrin.h as suggested by Intel:
     // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 848aa68..d7e8a13 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -75,38 +75,47 @@
 #endif
 
 #if defined __NVCC__
-  #pragma diag_suppress boolean_controlling_expr_is_constant
+  #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
+  #if defined __NVCC_DIAG_PRAGMA_SUPPORT__
+    #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X)
+  #else
+    #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X)
+  #endif
+
+  EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant)
   // Disable the "statement is unreachable" message
-  #pragma diag_suppress code_is_unreachable
+  EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
   // Disable the "dynamic initialization in unreachable code" message
-  #pragma diag_suppress initialization_not_reachable
+  EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable)
   // Disable the "invalid error number" message that we get with older versions of nvcc
-  #pragma diag_suppress 1222
+  EIGEN_NV_DIAG_SUPPRESS(1222)
   // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
-  #pragma diag_suppress 2527
-  #pragma diag_suppress 2529
-  #pragma diag_suppress 2651
-  #pragma diag_suppress 2653
-  #pragma diag_suppress 2668
-  #pragma diag_suppress 2669
-  #pragma diag_suppress 2670
-  #pragma diag_suppress 2671
-  #pragma diag_suppress 2735
-  #pragma diag_suppress 2737
-  #pragma diag_suppress 2739
-  #pragma diag_suppress 2885
-  #pragma diag_suppress 2888
-  #pragma diag_suppress 2976
-  #pragma diag_suppress 2979
-  #pragma diag_suppress 20011
-  #pragma diag_suppress 20014
+  EIGEN_NV_DIAG_SUPPRESS(2527)
+  EIGEN_NV_DIAG_SUPPRESS(2529)
+  EIGEN_NV_DIAG_SUPPRESS(2651)
+  EIGEN_NV_DIAG_SUPPRESS(2653)
+  EIGEN_NV_DIAG_SUPPRESS(2668)
+  EIGEN_NV_DIAG_SUPPRESS(2669)
+  EIGEN_NV_DIAG_SUPPRESS(2670)
+  EIGEN_NV_DIAG_SUPPRESS(2671)
+  EIGEN_NV_DIAG_SUPPRESS(2735)
+  EIGEN_NV_DIAG_SUPPRESS(2737)
+  EIGEN_NV_DIAG_SUPPRESS(2739)
+  EIGEN_NV_DIAG_SUPPRESS(2885)
+  EIGEN_NV_DIAG_SUPPRESS(2888)
+  EIGEN_NV_DIAG_SUPPRESS(2976)
+  EIGEN_NV_DIAG_SUPPRESS(2979)
+  EIGEN_NV_DIAG_SUPPRESS(20011)
+  EIGEN_NV_DIAG_SUPPRESS(20014)
   // Disable the "// __device__ annotation is ignored on a function(...) that is
   //              explicitly defaulted on its first declaration" message.
   // The __device__ annotation seems to actually be needed in some cases,
   // otherwise resulting in kernel runtime errors.
-  #pragma diag_suppress 2886
-  #pragma diag_suppress 2977
-  #pragma diag_suppress 20012
+  EIGEN_NV_DIAG_SUPPRESS(2886)
+  EIGEN_NV_DIAG_SUPPRESS(2977)
+  EIGEN_NV_DIAG_SUPPRESS(20012)
+  #undef EIGEN_NV_DIAG_SUPPRESS
+  #undef EIGEN_MAKE_PRAGMA
 #endif
 
 #else
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 4ed98d5..909d00e 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -855,7 +855,7 @@
 /** \class aligned_allocator
 * \ingroup Core_Module
 *
-* \brief STL compatible allocator to use with types requiring a non standrad alignment.
+* \brief STL compatible allocator to use with types requiring a non-standard alignment.
 *
 * The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
 * By default, it will thus provide at least 16 bytes alignment and more in following cases:
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 9dad396..f830de1 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -20,10 +20,18 @@
 //    Don't re-enable the diagnostic messages, as it turns out these messages need
 //    to be disabled at the point of the template instantiation (i.e the user code)
 //    otherwise they'll be triggered by nvcc.
-//    #pragma diag_default code_is_unreachable
-//    #pragma diag_default initialization_not_reachable
-//    #pragma diag_default 2651
-//    #pragma diag_default 2653
+//    #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
+//    #if __NVCC_DIAG_PRAGMA_SUPPORT__
+//      #define EIGEN_NV_DIAG_DEFAULT(X) EIGEN_MAKE_PRAGMA(nv_diag_default X)
+//    #else
+//      #define EIGEN_NV_DIAG_DEFAULT(X) EIGEN_MAKE_PRAGMA(diag_default X)
+//    #endif
+//    EIGEN_NV_DIAG_DEFAULT(code_is_unreachable)
+//    EIGEN_NV_DIAG_DEFAULT(initialization_not_reachable)
+//    EIGEN_NV_DIAG_DEFAULT(2651)
+//    EIGEN_NV_DIAG_DEFAULT(2653)
+//    #undef EIGEN_NV_DIAG_DEFAULT
+//    #undef EIGEN_MAKE_PRAGMA
   #endif
 
 #endif
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 3a27505..42ce0bb 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -309,7 +309,7 @@
 
   /** Constructs and initializes a quaternion from either:
     *  - a rotation matrix expression,
-    *  - a 4D vector expression representing quaternion coefficients.
+    *  - a 4D vector expression representing quaternion coefficients in the order [\c x, \c y, \c z, \c w].
     */
   template<typename Derived>
   EIGEN_DEVICE_FUNC explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 39db9c7..27ea962 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -1266,17 +1266,17 @@
   typedef typename TransformType::MatrixType MatrixType;
   typedef typename TransformType::AffinePart AffinePart;
   typedef typename TransformType::ConstAffinePart ConstAffinePart;
-  static inline AffinePart run(MatrixType& m)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AffinePart run(MatrixType& m)
   { return m.template block<TransformType::Dim,TransformType::HDim>(0,0); }
-  static inline ConstAffinePart run(const MatrixType& m)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstAffinePart run(const MatrixType& m)
   { return m.template block<TransformType::Dim,TransformType::HDim>(0,0); }
 };
 
 template<typename Scalar, int Dim, int Options>
 struct transform_take_affine_part<Transform<Scalar,Dim,AffineCompact, Options> > {
   typedef typename Transform<Scalar,Dim,AffineCompact,Options>::MatrixType MatrixType;
-  static inline MatrixType& run(MatrixType& m) { return m; }
-  static inline const MatrixType& run(const MatrixType& m) { return m; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MatrixType& run(MatrixType& m) { return m; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const MatrixType& run(const MatrixType& m) { return m; }
 };
 
 /*****************************************************
@@ -1286,7 +1286,7 @@
 template<typename Other, int Mode, int Options, int Dim, int HDim>
 struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, Dim,Dim>
 {
-  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
   {
     transform->linear() = other;
     transform->translation().setZero();
@@ -1297,7 +1297,7 @@
 template<typename Other, int Mode, int Options, int Dim, int HDim>
 struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, Dim,HDim>
 {
-  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
   {
     transform->affine() = other;
     transform->makeAffine();
@@ -1307,14 +1307,14 @@
 template<typename Other, int Mode, int Options, int Dim, int HDim>
 struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, HDim,HDim>
 {
-  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
+  static  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
   { transform->matrix() = other; }
 };
 
 template<typename Other, int Options, int Dim, int HDim>
 struct transform_construct_from_matrix<Other, AffineCompact,Options,Dim,HDim, HDim,HDim>
 {
-  static inline void run(Transform<typename Other::Scalar,Dim,AffineCompact,Options> *transform, const Other& other)
+  static  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform<typename Other::Scalar,Dim,AffineCompact,Options> *transform, const Other& other)
   { transform->matrix() = other.template block<Dim,HDim>(0,0); }
 };
 
diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index b402dfa..5f79b3a 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h
@@ -124,9 +124,6 @@
   const RowMajorMatrixType src_demean = src.colwise() - src_mean;
   const RowMajorMatrixType dst_demean = dst.colwise() - dst_mean;
 
-  // Eq. (36)-(37)
-  const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
-
   // Eq. (38)
   const MatrixType sigma = one_over_n * dst_demean * src_demean.transpose();
 
@@ -148,6 +145,9 @@
 
   if (with_scaling)
   {
+    // Eq. (36)-(37)
+    const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
+
     // Eq. (42)
     const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);
 
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 86f270d..8bb30cd 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -1,9 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
-// 
+//
 // We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
 // research report written by Ming Gu and Stanley C.Eisenstat
-// The code variable names correspond to the names they used in their 
+// The code variable names correspond to the names they used in their
 // report
 //
 // Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
@@ -29,12 +29,16 @@
 
 #include "./InternalHeaderCheck.h"
 
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+#include <iostream>
+#endif
+
 namespace Eigen {
 
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
 IOFormat bdcsvdfmt(8, 0, ", ", "\n", "  [", "]");
 #endif
-  
+
 template<typename MatrixType_> class BDCSVD;
 
 namespace internal {
@@ -44,11 +48,11 @@
         : traits<MatrixType_>
 {
   typedef MatrixType_ MatrixType;
-};  
+};
 
 } // end namespace internal
-  
-  
+
+
 /** \ingroup SVD_Module
  *
  *
@@ -75,31 +79,31 @@
 class BDCSVD : public SVDBase<BDCSVD<MatrixType_> >
 {
   typedef SVDBase<BDCSVD> Base;
-    
+
 public:
   using Base::rows;
   using Base::cols;
   using Base::computeU;
   using Base::computeV;
-  
+
   typedef MatrixType_ MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
   typedef typename NumTraits<RealScalar>::Literal Literal;
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime, 
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime, 
-    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), 
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, 
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, 
-    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), 
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime),
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime),
     MatrixOptions = MatrixType::Options
   };
 
   typedef typename Base::MatrixUType MatrixUType;
   typedef typename Base::MatrixVType MatrixVType;
   typedef typename Base::SingularValuesType SingularValuesType;
-  
+
   typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> MatrixX;
   typedef Matrix<RealScalar, Dynamic, Dynamic, ColMajor> MatrixXr;
   typedef Matrix<RealScalar, Dynamic, 1> VectorType;
@@ -133,7 +137,7 @@
    *
    * \param matrix the matrix to decompose
    * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
+   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU,
    *                           #ComputeFullV, #ComputeThinV.
    *
    * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
@@ -145,15 +149,15 @@
     compute(matrix, computationOptions);
   }
 
-  ~BDCSVD() 
+  ~BDCSVD()
   {
   }
-  
+
   /** \brief Method performing the decomposition of given matrix using custom options.
    *
    * \param matrix the matrix to decompose
    * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
+   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU,
    *                           #ComputeFullV, #ComputeThinV.
    *
    * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
@@ -172,12 +176,12 @@
     return compute(matrix, this->m_computationOptions);
   }
 
-  void setSwitchSize(int s) 
+  void setSwitchSize(int s)
   {
-    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3");
+    eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3.");
     m_algoswap = s;
   }
- 
+
 private:
   void allocate(Index rows, Index cols, unsigned int computationOptions);
   void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
@@ -201,7 +205,7 @@
   ArrayXi m_workspaceI;
   int m_algoswap;
   bool m_isTranspose, m_compU, m_compV;
-  
+
   using Base::m_singularValues;
   using Base::m_diagSize;
   using Base::m_computeFullU;
@@ -214,7 +218,7 @@
   using Base::m_isInitialized;
   using Base::m_nonzeroSingularValues;
 
-public:  
+public:
   int m_numIters;
 }; //end class BDCSVD
 
@@ -227,24 +231,24 @@
 
   if (Base::allocate(rows, cols, computationOptions))
     return;
-  
+
   m_computed = MatrixXr::Zero(m_diagSize + 1, m_diagSize );
   m_compU = computeV();
   m_compV = computeU();
   if (m_isTranspose)
     std::swap(m_compU, m_compV);
-  
+
   if (m_compU) m_naiveU = MatrixXr::Zero(m_diagSize + 1, m_diagSize + 1 );
   else         m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 );
-  
+
   if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize);
-  
+
   m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3);
   m_workspaceI.resize(3*m_diagSize);
 }// end allocate
 
 template<typename MatrixType>
-BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions) 
+BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions)
 {
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "\n\n\n======================================================================================================================\n\n\n";
@@ -253,7 +257,7 @@
   using std::abs;
 
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
-  
+
   //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
   if(matrix.cols() < m_algoswap)
   {
@@ -269,7 +273,7 @@
     }
     return *this;
   }
-  
+
   //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
   if (!(numext::isfinite)(scale)) {
@@ -282,7 +286,7 @@
   MatrixX copy;
   if (m_isTranspose) copy = matrix.adjoint()/scale;
   else               copy = matrix/scale;
-  
+
   //**** step 1 - Bidiagonalization
   // FIXME this line involves temporaries
   internal::UpperBidiagonalization<MatrixX> bid(copy);
@@ -298,7 +302,7 @@
     m_isInitialized = true;
     return *this;
   }
-    
+
   //**** step 3 - Copy singular values and vectors
   for (int i=0; i<m_diagSize; i++)
   {
@@ -387,7 +391,7 @@
         ++k2;
       }
     }
-  
+
     A.topRows(n1).noalias()    = A1.leftCols(k1) * B1.topRows(k1);
     A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
   }
@@ -399,15 +403,15 @@
   }
 }
 
-// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the 
+// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the
 // place of the submatrix we are currently working on.
 
 //@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
-//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
+//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
 // lastCol + 1 - firstCol is the size of the submatrix.
 //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
-//@param firstRowW : Same as firstRowW with the column.
-//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
+//@param firstColW : Same as firstRowW with the column.
+//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix
 // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
 template<typename MatrixType>
 void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
@@ -420,11 +424,11 @@
   const Index k = n/2;
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   RealScalar alphaK;
-  RealScalar betaK; 
-  RealScalar r0; 
+  RealScalar betaK;
+  RealScalar r0;
   RealScalar lambda, phi, c0, s0;
   VectorType l, f;
-  // We use the other algorithm which is more efficient for small 
+  // We use the other algorithm which is more efficient for small
   // matrices.
   if (n < m_algoswap)
   {
@@ -434,7 +438,7 @@
     if (m_info != Success && m_info != NoConvergence) return;
     if (m_compU)
       m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
-    else 
+    else
     {
       m_naiveU.row(0).segment(firstCol, n + 1).real() = b.matrixU().row(0);
       m_naiveU.row(1).segment(firstCol, n + 1).real() = b.matrixU().row(n);
@@ -448,8 +452,8 @@
   alphaK =  m_computed(firstCol + k, firstCol + k);
   betaK = m_computed(firstCol + k + 1, firstCol + k);
   // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
-  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
-  // right submatrix before the left one. 
+  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the
+  // right submatrix before the left one.
   divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
   if (m_info != Success && m_info != NoConvergence) return;
   divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
@@ -459,8 +463,8 @@
   {
     lambda = m_naiveU(firstCol + k, firstCol + k);
     phi = m_naiveU(firstCol + k + 1, lastCol + 1);
-  } 
-  else 
+  }
+  else
   {
     lambda = m_naiveU(1, firstCol + k);
     phi = m_naiveU(0, lastCol + 1);
@@ -470,8 +474,8 @@
   {
     l = m_naiveU.row(firstCol + k).segment(firstCol, k);
     f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
-  } 
-  else 
+  }
+  else
   {
     l = m_naiveU.row(1).segment(firstCol, k);
     f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
@@ -487,52 +491,52 @@
     c0 = alphaK * lambda / r0;
     s0 = betaK * phi / r0;
   }
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
-  
+
   if (m_compU)
   {
-    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));     
+    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));
     // we shiftW Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
+    for (Index i = firstCol + k - 1; i >= firstCol; i--)
       m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
     // we shift q1 at the left with a factor c0
     m_naiveU.col(firstCol).segment( firstCol, k + 1) = (q1 * c0);
     // last column = q1 * - s0
     m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * ( - s0));
     // first column = q2 * s0
-    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0; 
+    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0;
     // q2 *= c0
     m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
-  } 
-  else 
+  }
+  else
   {
     RealScalar q1 = m_naiveU(0, firstCol + k);
     // we shift Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
+    for (Index i = firstCol + k - 1; i >= firstCol; i--)
       m_naiveU(0, i + 1) = m_naiveU(0, i);
     // we shift q1 at the left with a factor c0
     m_naiveU(0, firstCol) = (q1 * c0);
     // last column = q1 * - s0
     m_naiveU(0, lastCol + 1) = (q1 * ( - s0));
     // first column = q2 * s0
-    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; 
+    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0;
     // q2 *= c0
     m_naiveU(1, lastCol + 1) *= c0;
     m_naiveU.row(1).segment(firstCol + 1, k).setZero();
     m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
   }
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
-  
+
   m_computed(firstCol + shift, firstCol + shift) = r0;
   m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real();
   m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real();
@@ -553,17 +557,17 @@
 //   assert(count<681);
 //   assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all());
 #endif
-  
+
   // Third part: compute SVD of combined matrix
   MatrixXr UofSVD, VofSVD;
   VectorType singVals;
   computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(UofSVD.allFinite());
   assert(VofSVD.allFinite());
 #endif
-  
+
   if (m_compU)
     structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
   else
@@ -572,15 +576,15 @@
     tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD;
     m_naiveU.middleCols(firstCol, n + 1) = tmp;
   }
-  
+
   if (m_compV)  structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2);
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
-  
+
   m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
   m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
 }// end divide
@@ -612,7 +616,7 @@
   if (col0.hasNaN() || diag.hasNaN())
     std::cout << "\n\nHAS NAN\n\n";
 #endif
-  
+
   // Many singular values might have been deflated, the zero ones have been moved to the end,
   // but others are interleaved and we must ignore them at this stage.
   // To this end, let's compute a permutation skipping them:
@@ -623,7 +627,7 @@
     if(abs(col0(k))>considerZero)
       m_workspaceI(m++) = k;
   Map<ArrayXi> perm(m_workspaceI.data(),m);
-  
+
   Map<ArrayXr> shifts(m_workspace.data()+1*n, n);
   Map<ArrayXr> mus(m_workspace.data()+2*n, n);
   Map<ArrayXr> zhat(m_workspace.data()+3*n, n);
@@ -633,16 +637,16 @@
   std::cout << "  z: " << col0.transpose() << "\n";
   std::cout << "  d: " << diag.transpose() << "\n";
 #endif
-  
+
   // Compute singVals, shifts, and mus
   computeSingVals(col0, diag, perm, singVals, shifts, mus);
-  
+
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "  j:        " << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse() << "\n\n";
   std::cout << "  sing-val: " << singVals.transpose() << "\n";
   std::cout << "  mu:       " << mus.transpose() << "\n";
   std::cout << "  shift:    " << shifts.transpose() << "\n";
-  
+
   {
     std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
     std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
@@ -651,30 +655,30 @@
     assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all());
   }
 #endif
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(singVals.allFinite());
   assert(mus.allFinite());
   assert(shifts.allFinite());
 #endif
-  
+
   // Compute zhat
   perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "  zhat: " << zhat.transpose() << "\n";
 #endif
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(zhat.allFinite());
 #endif
-  
+
   computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
-  
+
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() << "\n";
   std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() << "\n";
 #endif
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
@@ -684,7 +688,7 @@
 //   assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
 //   assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
 #endif
-  
+
   // Because of deflation, the singular values might not be completely sorted.
   // Fortunately, reordering them is a O(n) problem
   for(Index i=0; i<actual_n-1; ++i)
@@ -706,13 +710,13 @@
     assert(singular_values_sorted);
   }
 #endif
-  
+
   // Reverse order so that singular values in increased order
   // Because of deflation, the zeros singular-values are already at the end
   singVals.head(actual_n).reverseInPlace();
   U.leftCols(actual_n).rowwise().reverseInPlace();
   if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
-  
+
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) );
   std::cout << "  * j:        " << jsvd.singularValues().transpose() << "\n\n";
@@ -761,7 +765,7 @@
       mus(k) = Literal(0);
       shifts(k) = k==0 ? col0(0) : diag(k);
       continue;
-    } 
+    }
 
     // otherwise, use secular equation to find singular value
     RealScalar left = diag(k);
@@ -800,7 +804,7 @@
               << " "       << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n";
 #endif
     RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right;
-    
+
     // measure everything relative to shift
     Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
     diagShifted = diag - shift;
@@ -819,7 +823,7 @@
         diagShifted = diag - shift;
       }
     }
-    
+
     // initial guess
     RealScalar muPrev, muCur;
     if (shift == left)
@@ -859,12 +863,12 @@
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
       assert((numext::isfinite)(fZero));
 #endif
-      
+
       muPrev = muCur;
       fPrev = fCur;
       muCur = muZero;
       fCur = fZero;
-      
+
       if (shift == left  && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
       if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
       if (abs(fCur)>abs(fPrev)) useBisection = true;
@@ -901,7 +905,7 @@
       RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
       eigen_internal_assert(fLeft<Literal(0));
 
-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
+#if defined EIGEN_BDCSVD_DEBUG_VERBOSE || defined EIGEN_BDCSVD_SANITY_CHECKS || defined EIGEN_INTERNAL_DEBUGGING
       RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
 #endif
 
@@ -914,7 +918,7 @@
         std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
       // assert((numext::isfinite)(fRight));
 #endif
-    
+
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
       if(!(fLeft * fRight<0))
       {
@@ -948,7 +952,7 @@
         }
         muCur = (leftShifted + rightShifted) / Literal(2);
       }
-      else 
+      else
       {
         // We have a problem as shifting on the left or right give either a positive or negative value
         // at the middle of [left,right]...
@@ -959,7 +963,7 @@
           muCur = -muCur;
       }
     }
-      
+
     singVals[k] = shift + muCur;
     shifts[k] = shift;
     mus[k] = muCur;
@@ -1070,7 +1074,7 @@
 {
   Index n = zhat.size();
   Index m = perm.size();
-  
+
   for (Index k = 0; k < n; ++k)
   {
     if (zhat(k) == Literal(0))
@@ -1088,7 +1092,7 @@
       }
       U(n,k) = Literal(0);
       U.col(k).normalize();
-    
+
       if (m_compV)
       {
         V.col(k).setZero();
@@ -1124,10 +1128,10 @@
     m_computed(start+i, start+i) = Literal(0);
     return;
   }
-  m_computed(start,start) = r;  
+  m_computed(start,start) = r;
   m_computed(start+i, start) = Literal(0);
   m_computed(start+i, start+i) = Literal(0);
-  
+
   JacobiRotation<RealScalar> J(c/r,-s/r);
   if (m_compU)  m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J);
   else          m_naiveU.applyOnTheRight(firstCol, firstCol+i, J);
@@ -1184,29 +1188,29 @@
   using std::sqrt;
   using std::abs;
   const Index length = lastCol + 1 - firstCol;
-  
+
   Block<MatrixXr,Dynamic,1> col0(m_computed, firstCol+shift, firstCol+shift, length, 1);
   Diagonal<MatrixXr> fulldiag(m_computed);
   VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length);
-  
+
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
   RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
   RealScalar epsilon_coarse = Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
 
-#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE  
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "\ndeflate:" << diag.head(k+1).transpose() << "  |  " << diag.segment(k+1,length-k-1).transpose() << "\n";
 #endif
-  
+
   //condition 4.1
   if (diag(0) < epsilon_coarse)
-  { 
+  {
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
     std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n";
 #endif
@@ -1243,22 +1247,22 @@
   std::cout << "            : " << col0.transpose() << "\n\n";
 #endif
   {
-    // Check for total deflation
-    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
-    bool total_deflation = (col0.tail(length-1).array()<considerZero).all();
-    
+    // Check for total deflation:
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
+    const bool total_deflation = (col0.tail(length-1).array().abs()<considerZero).all();
+
     // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
     // First, compute the respective permutation.
     Index *permutation = m_workspaceI.data();
     {
       permutation[0] = 0;
       Index p = 1;
-      
+
       // Move deflated diagonal entries at the end.
       for(Index i=1; i<length; ++i)
         if(abs(diag(i))<considerZero)
           permutation[p++] = i;
-        
+
       Index i=1, j=k+1;
       for( ; p < length; ++p)
       {
@@ -1268,7 +1272,7 @@
         else                        permutation[p] = i++;
       }
     }
-    
+
     // If we have a total deflation, then we have to insert diag(0) at the right place
     if(total_deflation)
     {
@@ -1284,22 +1288,22 @@
         }
       }
     }
-    
+
     // Current index of each col, and current column of each index
     Index *realInd = m_workspaceI.data()+length;
     Index *realCol = m_workspaceI.data()+2*length;
-    
+
     for(int pos = 0; pos< length; pos++)
     {
       realCol[pos] = pos;
       realInd[pos] = pos;
     }
-    
+
     for(Index i = total_deflation?0:1; i < length; i++)
     {
       const Index pi = permutation[length - (total_deflation ? i+1 : i)];
       const Index J = realCol[pi];
-      
+
       using std::swap;
       // swap diagonal and first column entries:
       swap(diag(i), diag(J));
@@ -1322,7 +1326,7 @@
   std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";
   std::cout << "      : " << col0.transpose() << "\n\n";
 #endif
-    
+
   //condition 4.4
   {
     Index i = length-1;
@@ -1337,12 +1341,12 @@
         deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
       }
   }
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   for(Index j=2;j<length;++j)
     assert(diag(j-1)<=diag(j) || abs(diag(j))<considerZero);
 #endif
-  
+
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 696f29d..1f7aeb5 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -225,22 +225,6 @@
       }
     }
 
-    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
-    {
-      Index k = 0;
-      Index n = size();
-      for (Index i=0; i<n; ++i)
-      {
-        if (!internal::isMuchSmallerThan(value(i), reference, epsilon))
-        {
-          value(k) = value(i);
-          index(k) = index(i);
-          ++k;
-        }
-      }
-      resize(k,0);
-    }
-
   protected:
 
     inline void reallocate(Index size)
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index d4aa473..6bd0fa1 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -126,6 +126,11 @@
 
 namespace internal {
 
+
+// Helper template to generate new sparse matrix types
+template<class Source, int Order>
+using WithStorageOrder = SparseMatrix<typename Source::Scalar, Order, typename Source::StorageIndex>;
+
 template<typename Lhs, typename Rhs, typename ResultType,
   int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
   int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
@@ -140,15 +145,15 @@
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrixAux;
-    typedef typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type ColMajorMatrix;
+    using RowMajorMatrix = WithStorageOrder<ResultType, RowMajor>;
+    using ColMajorMatrixAux = WithStorageOrder<ResultType, ColMajor>;
 
     // If the result is tall and thin (in the extreme case a column vector)
     // then it is faster to sort the coefficients inplace instead of transposing twice.
     // FIXME, the following heuristic is probably not very good.
     if(lhs.rows()>rhs.cols())
     {
+      using ColMajorMatrix = typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type;
       ColMajorMatrix resCol(lhs.rows(),rhs.cols());
       // perform sorted insertion
       internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol, true);
@@ -170,8 +175,8 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRhs;
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+    using RowMajorRhs = WithStorageOrder<Rhs, RowMajor>;
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
     RowMajorRhs rhsRow = rhs;
     RowMajorRes resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<RowMajorRhs,Lhs,RowMajorRes>(rhsRow, lhs, resRow);
@@ -184,8 +189,8 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorLhs;
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+    using RowMajorLhs = WithStorageOrder<Lhs, RowMajor>;
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
     RowMajorLhs lhsRow = lhs;
     RowMajorRes resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorLhs,RowMajorRes>(rhs, lhsRow, resRow);
@@ -198,9 +203,9 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorRes>(rhs, lhs, resRow);
     res = resRow;
   }
 };
@@ -213,9 +218,9 @@
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorRes>(lhs, rhs, resCol);
     res = resCol;
   }
 };
@@ -225,8 +230,8 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+    using ColMajorLhs = WithStorageOrder<Lhs, ColMajor>;
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
     ColMajorLhs lhsCol = lhs;
     ColMajorRes resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<ColMajorLhs,Rhs,ColMajorRes>(lhsCol, rhs, resCol);
@@ -239,8 +244,8 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+    using ColMajorRhs = WithStorageOrder<Rhs, ColMajor>;
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
     ColMajorRhs rhsCol = rhs;
     ColMajorRes resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorRhs,ColMajorRes>(lhs, rhsCol, resCol);
@@ -253,12 +258,12 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
-    RowMajorMatrix resRow(lhs.rows(),rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
+    RowMajorRes resRow(lhs.rows(),rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorRes>(rhs, lhs, resRow);
     // sort the non zeros:
-    ColMajorMatrix resCol(resRow);
+    ColMajorRes resCol(resRow);
     res = resCol;
   }
 };
@@ -319,7 +324,7 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+    using ColMajorLhs = WithStorageOrder<Lhs, ColMajor>;
     ColMajorLhs lhsCol(lhs);
     internal::sparse_sparse_to_dense_product_impl<ColMajorLhs,Rhs,ResultType>(lhsCol, rhs, res);
   }
@@ -330,7 +335,7 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+    using ColMajorRhs = WithStorageOrder<Rhs, ColMajor>;
     ColMajorRhs rhsCol(rhs);
     internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorRhs,ResultType>(lhs, rhsCol, res);
   }
diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h
deleted file mode 100644
index efb7b49..0000000
--- a/Eigen/src/SparseCore/MappedSparseMatrix.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MAPPED_SPARSEMATRIX_H
-#define EIGEN_MAPPED_SPARSEMATRIX_H
-
-#include "./InternalHeaderCheck.h"
-
-namespace Eigen {
-
-/** \deprecated Use Map<SparseMatrix<> >
-  * \class MappedSparseMatrix
-  *
-  * \brief Sparse matrix
-  *
-  * \param Scalar_ the scalar type, i.e. the type of the coefficients
-  *
-  * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
-  *
-  */
-namespace internal {
-template<typename Scalar_, int _Flags, typename StorageIndex_>
-struct traits<MappedSparseMatrix<Scalar_, _Flags, StorageIndex_> > : traits<SparseMatrix<Scalar_, _Flags, StorageIndex_> >
-{};
-} // end namespace internal
-
-template<typename Scalar_, int _Flags, typename StorageIndex_>
-class MappedSparseMatrix
-  : public Map<SparseMatrix<Scalar_, _Flags, StorageIndex_> >
-{
-    typedef Map<SparseMatrix<Scalar_, _Flags, StorageIndex_> > Base;
-
-  public:
-    
-    typedef typename Base::StorageIndex StorageIndex;
-    typedef typename Base::Scalar Scalar;
-
-    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0)
-      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr)
-    {}
-
-    /** Empty destructor */
-    inline ~MappedSparseMatrix() {}
-};
-
-namespace internal {
-
-template<typename Scalar_, int Options_, typename StorageIndex_>
-struct evaluator<MappedSparseMatrix<Scalar_,Options_,StorageIndex_> >
-  : evaluator<SparseCompressedBase<MappedSparseMatrix<Scalar_,Options_,StorageIndex_> > >
-{
-  typedef MappedSparseMatrix<Scalar_,Options_,StorageIndex_> XprType;
-  typedef evaluator<SparseCompressedBase<XprType> > Base;
-  
-  evaluator() : Base() {}
-  explicit evaluator(const XprType &mat) : Base(mat) {}
-};
-
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MAPPED_SPARSEMATRIX_H
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 4bf47bb..9cb5d21 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -110,7 +110,7 @@
     using Base::operator+=;
     using Base::operator-=;
 
-    typedef MappedSparseMatrix<Scalar,Flags> Map;
+    typedef Eigen::Map<SparseMatrix<Scalar,Flags,StorageIndex>> Map;
     typedef Diagonal<SparseMatrix> DiagonalReturnType;
     typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
     typedef typename Base::InnerIterator InnerIterator;
@@ -126,7 +126,7 @@
     typedef typename Base::IndexVector IndexVector;
     typedef typename Base::ScalarVector ScalarVector;
   protected:
-    typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
+    typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0),StorageIndex> TransposedSparseMatrix;
 
     Index m_outerSize;
     Index m_innerSize;
diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h
index 166b17e..26403ac 100644
--- a/Eigen/src/SparseCore/SparseRef.h
+++ b/Eigen/src/SparseCore/SparseRef.h
@@ -135,7 +135,7 @@
     template<int OtherOptions>
     inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
     template<int OtherOptions>
-    inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+    inline Ref(const Map<SparseMatrix<MatScalar,OtherOptions,MatIndex>>& expr);
   public:
 
     typedef internal::SparseRefBase<Ref> Base;
@@ -150,15 +150,15 @@
       eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
       Base::construct(expr.derived());
     }
-    
+
     template<int OtherOptions>
-    inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    inline Ref(Map<SparseMatrix<MatScalar,OtherOptions,MatIndex> >& expr)
     {
       EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
       eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
       Base::construct(expr.derived());
     }
-    
+
     template<typename Derived>
     inline Ref(const SparseCompressedBase<Derived>& expr)
     #else
diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h
index 3127c7e..19b59d1 100644
--- a/Eigen/src/SparseCore/SparseUtil.h
+++ b/Eigen/src/SparseCore/SparseUtil.h
@@ -53,7 +53,6 @@
 
 template<typename Scalar_, int _Flags = 0, typename StorageIndex_ = int>  class SparseMatrix;
 template<typename Scalar_, int _Flags = 0, typename StorageIndex_ = int>  class SparseVector;
-template<typename Scalar_, int _Flags = 0, typename StorageIndex_ = int>  class MappedSparseMatrix;
 
 template<typename MatrixType, unsigned int UpLo>  class SparseSelfAdjointView;
 template<typename Lhs, typename Rhs>              class SparseDiagonalProduct;
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 5706948..7d7034f 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -209,9 +209,33 @@
     inline void finalize() {}
 
     /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */
-    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
+    Index prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) {
+      return prune([&](const Scalar& val){ return !internal::isMuchSmallerThan(val, reference, epsilon); });
+    }
+
+    /**
+     * \brief Prunes the entries of the vector based on a `predicate`
+     * \tparam F Type of the predicate.
+     * \param keep_predicate The predicate that is used to test whether a value should be kept. A callable that
+     * is passed a `Scalar` value and returns a boolean. If the predicate returns true, the value is kept.
+     * \return The new number of structural non-zeros.
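+     *
+     * A minimal usage sketch (the inserted values are only illustrative):
+     * \code
+     * SparseVector<double> v(10);
+     * v.insert(2) = 1e-12;
+     * v.insert(7) = 3.0;
+     * // keep only entries whose magnitude exceeds 1e-6
+     * Index kept = v.prune([](double x) { return std::abs(x) > 1e-6; });  // kept == 1
+     * \endcode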
+     */
+    template<class F>
+    Index prune(F&& keep_predicate)
     {
-      m_data.prune(reference,epsilon);
+      Index k = 0;
+      Index n = m_data.size();
+      for (Index i = 0; i < n; ++i)
+      {
+        if (keep_predicate(m_data.value(i)))
+        {
+          m_data.value(k) = std::move(m_data.value(i));
+          m_data.index(k) = m_data.index(i);
+          ++k;
+        }
+      }
+      m_data.resize(k);
+      return k;
     }
 
     /** Resizes the sparse vector to \a rows x \a cols
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index cdc2b73..1516d26 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -252,9 +252,9 @@
       * y = b; matrixU().solveInPlace(y);
       * \endcode
       */
-    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,StorageIndex> > matrixU() const
+    SparseLUMatrixUReturnType<SCMatrix,Map<SparseMatrix<Scalar,ColMajor,StorageIndex> > > matrixU() const
     {
-      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,StorageIndex> >(m_Lstore, m_Ustore);
+      return SparseLUMatrixUReturnType<SCMatrix, Map<SparseMatrix<Scalar,ColMajor,StorageIndex> > >(m_Lstore, m_Ustore);
     }
 
     /**
@@ -476,7 +476,7 @@
     std::string m_lastError;
     NCMatrix m_mat; // The input (permuted ) matrix 
     SCMatrix m_Lstore; // The lower triangular matrix (supernodal)
-    MappedSparseMatrix<Scalar,ColMajor,StorageIndex> m_Ustore; // The upper triangular matrix
+    Map<SparseMatrix<Scalar,ColMajor,StorageIndex>> m_Ustore; // The upper triangular matrix
     PermutationType m_perm_c; // Column permutation 
     PermutationType m_perm_r ; // Row permutation
     IndexVector m_etree; // Column elimination tree 
@@ -791,7 +791,7 @@
   // Create supernode matrix L 
   m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); 
   // Create the column major upper sparse matrix  U; 
-  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, StorageIndex> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() );
+  new (&m_Ustore) Map<SparseMatrix<Scalar, ColMajor, StorageIndex>> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() );
   
   m_info = Success;
   m_factorizationIsOk = true;
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index f9a9a43..4bac22d 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -297,14 +297,14 @@
 
 /** View a Super LU matrix as an Eigen expression */
 template<typename Scalar, int Flags, typename Index>
-MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
+Map<SparseMatrix<Scalar,Flags,Index> > map_superlu(SluMatrix& sluMat)
 {
   eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR)
          || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC));
 
   Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;
 
-  return MappedSparseMatrix<Scalar,Flags,Index>(
+  return Map<SparseMatrix<Scalar,Flags,Index> >(
     sluMat.nrow, sluMat.ncol, sluMat.storage.outerInd[outerSize],
     sluMat.storage.outerInd, sluMat.storage.innerInd, reinterpret_cast<Scalar*>(sluMat.storage.values) );
 }
diff --git a/cmake/FindCLANG_FORMAT.cmake b/cmake/FindCLANG_FORMAT.cmake
new file mode 100644
index 0000000..e00f19f
--- /dev/null
+++ b/cmake/FindCLANG_FORMAT.cmake
@@ -0,0 +1,61 @@
+
+
+# Find clang-format
+#
+# CLANG_FORMAT_EXECUTABLE   - Path to clang-format executable
+# CLANG_FORMAT_FOUND        - True if the clang-format executable was found.
+# CLANG_FORMAT_VERSION      - The version of clang-format found
+#
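+# Typical usage, assuming the containing directory is on CMAKE_MODULE_PATH:
+#
+#   find_package(CLANG_FORMAT)
+#   if(CLANG_FORMAT_FOUND)
+#     message(STATUS "Found clang-format ${CLANG_FORMAT_VERSION}: ${CLANG_FORMAT_EXECUTABLE}")
+#   endif()
+#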
+# Copyright 2009-2020 The VOTCA Development Team (http://www.votca.org)
+#
+# Licensed under the Mozilla Public License Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.mozilla.org/en-US/MPL/2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+find_program(CLANG_FORMAT_EXECUTABLE
+             NAMES
+                   clang-format-9
+                   clang-format
+                   clang-format-11
+                   clang-format-10
+                   clang-format-8
+                   clang-format-7
+
+             DOC "clang-format executable")
+mark_as_advanced(CLANG_FORMAT_EXECUTABLE)
+
+# Extract version from command "clang-format -version"
+if(CLANG_FORMAT_EXECUTABLE)
+  execute_process(COMMAND ${CLANG_FORMAT_EXECUTABLE} -version
+                  OUTPUT_VARIABLE clang_format_version
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if(clang_format_version MATCHES "^.*clang-format version .*")
+    # clang_format_version sample: "clang-format version 3.9.1-4ubuntu3~16.04.1
+    # (tags/RELEASE_391/rc2)"
+    string(REGEX
+           REPLACE "^.*clang-format version ([.0-9]+).*"
+                   "\\1"
+                   CLANG_FORMAT_VERSION
+                   "${clang_format_version}")
+    # CLANG_FORMAT_VERSION sample: "3.9.1"
+  else()
+    set(CLANG_FORMAT_VERSION 0.0)
+  endif()
+else()
+  set(CLANG_FORMAT_VERSION 0.0)
+endif()
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set CLANG_FORMAT_FOUND to TRUE
+# if all listed variables are TRUE
+find_package_handle_standard_args(CLANG_FORMAT REQUIRED_VARS CLANG_FORMAT_EXECUTABLE VERSION_VAR CLANG_FORMAT_VERSION)
diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
index e92a7dc..f98cdca 100644
--- a/test/bdcsvd.cpp
+++ b/test/bdcsvd.cpp
@@ -54,27 +54,53 @@
   VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
 }
 
-// compare the Singular values returned with Jacobi and Bdc
-template<typename MatrixType> 
-void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0)
+// Compare the Singular values returned with Jacobi and Bdc.
+template<typename MatrixType>
+void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0, int algoswap = 16, bool random = true)
 {
-  MatrixType m = MatrixType::Random(a.rows(), a.cols());
-  BDCSVD<MatrixType> bdc_svd(m);
+  MatrixType m = random ? MatrixType::Random(a.rows(), a.cols()) : a;
+
+  BDCSVD<MatrixType> bdc_svd(m.rows(), m.cols(), computationOptions);
+  bdc_svd.setSwitchSize(algoswap);
+  bdc_svd.compute(m);
+
   JacobiSVD<MatrixType> jacobi_svd(m);
   VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues());
+
   if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
   if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
   if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
   if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
 }
 
+// Verifies total deflation is **not** triggered.
+void compare_bdc_jacobi_instance(bool structure_as_m, int algoswap = 16)
+{
+  MatrixXd m(4, 3);
+  if (structure_as_m) {
+    // The first 3 rows are the reduced form of Matrix 1 as shown below, and it
+    // has nonzero elements in the first column and diagonals only.
+    m << 1.056293, 0, 0,
+         -0.336468, 0.907359, 0,
+         -1.566245, 0, 0.149150,
+         -0.1, 0, 0;
+  } else {
+    // Matrix 1.
+    m << 0.882336, 18.3914, -26.7921,
+         -5.58135, 17.1931, -24.0892,
+         -20.794, 8.68496, -4.83103,
+         -8.4981, -10.5451, 23.9072;
+  }
+  compare_bdc_jacobi(m, 0, algoswap, false);
+}
+
 EIGEN_DECLARE_TEST(bdcsvd)
 {
   CALL_SUBTEST_3(( svd_verify_assert<BDCSVD<Matrix3f>  >(Matrix3f()) ));
   CALL_SUBTEST_4(( svd_verify_assert<BDCSVD<Matrix4d>  >(Matrix4d()) ));
   CALL_SUBTEST_7(( svd_verify_assert<BDCSVD<MatrixXf>  >(MatrixXf(10,12)) ));
   CALL_SUBTEST_8(( svd_verify_assert<BDCSVD<MatrixXcd> >(MatrixXcd(7,5)) ));
-  
+
   CALL_SUBTEST_101(( svd_all_trivial_2x2(bdcsvd<Matrix2cd>) ));
   CALL_SUBTEST_102(( svd_all_trivial_2x2(bdcsvd<Matrix2d>) ));
 
@@ -85,10 +111,10 @@
 
     int r = internal::random<int>(1, EIGEN_TEST_MAX_SIZE/2),
         c = internal::random<int>(1, EIGEN_TEST_MAX_SIZE/2);
-    
+
     TEST_SET_BUT_UNUSED_VARIABLE(r)
     TEST_SET_BUT_UNUSED_VARIABLE(c)
-    
+
     CALL_SUBTEST_6((  bdcsvd(Matrix<double,Dynamic,2>(r,2)) ));
     CALL_SUBTEST_7((  bdcsvd(MatrixXf(r,c)) ));
     CALL_SUBTEST_7((  compare_bdc_jacobi(MatrixXf(r,c)) ));
@@ -114,5 +140,12 @@
   // CALL_SUBTEST_9( svd_preallocate<void>() );
 
   CALL_SUBTEST_2( svd_underoverflow<void>() );
-}
 
+  // Without total deflation issues.
+  CALL_SUBTEST_11((  compare_bdc_jacobi_instance(true) ));
+  CALL_SUBTEST_12((  compare_bdc_jacobi_instance(false) ));
+
+  // These cases previously hit total deflation issues, even though total deflation should not be triggered.
+  CALL_SUBTEST_13((  compare_bdc_jacobi_instance(true, 3) ));
+  CALL_SUBTEST_14((  compare_bdc_jacobi_instance(false, 3) ));
+}
diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp
index c5356bc..3a86bea 100644
--- a/test/nestbyvalue.cpp
+++ b/test/nestbyvalue.cpp
@@ -26,7 +26,7 @@
   for(int i = 0; i < g_repeat; i++) {
     Index rows = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
     Index cols = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
-    MatrixXd a = MatrixXd(rows,cols);
+    MatrixXd a = MatrixXd::Random(rows,cols);
     nb_temporaries = 0;
     XprType x = get_xpr_with_temps(a);
     VERIFY_IS_EQUAL(nb_temporaries,6);
diff --git a/test/sparse.h b/test/sparse.h
index 67cc06e..663aaee 100644
--- a/test/sparse.h
+++ b/test/sparse.h
@@ -59,7 +59,8 @@
   sparseMat.setZero();
   //sparseMat.reserve(int(refMat.rows()*refMat.cols()*density));
   sparseMat.reserve(VectorXi::Constant(IsRowMajor ? refMat.rows() : refMat.cols(), int((1.5*density)*(IsRowMajor?refMat.cols():refMat.rows()))));
-  
+
+  Index insert_count = 0;
   for(Index j=0; j<sparseMat.outerSize(); j++)
   {
     //sparseMat.startVec(j);
@@ -89,6 +90,7 @@
       {
         //sparseMat.insertBackByOuterInner(j,i) = v;
         sparseMat.insertByOuterInner(j,i) = v;
+        ++insert_count;
         if (nonzeroCoords)
           nonzeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
       }
@@ -97,6 +99,9 @@
         zeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
       }
       refMat(ai,aj) = v;
+
+      // make sure we only insert as many as the sparse matrix supports
+      if(insert_count == NumTraits<StorageIndex>::highest()) return;
     }
   }
   //sparseMat.finalize();
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 9453111..93c30a5 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -431,12 +431,6 @@
       VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
       VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
     }
-    {
-      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
-      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
-      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
-      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
-    }
 
     Index i = internal::random<Index>(0,rows-1);
     Index j = internal::random<Index>(0,cols-1);
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index 6e85f69..488a392 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -461,6 +461,58 @@
   VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast<Cplx>().col(0) );
 }
 
+// Test mixed storage types
+template<int OrderA, int OrderB, int OrderC>
+void test_mixed_storage_imp() {
+  typedef float Real;
+  typedef Matrix<Real,Dynamic,Dynamic> DenseMat;
+
+  // Case: Large inputs but small result
+  {
+    SparseMatrix<Real, OrderA> A(8, 512);
+    SparseMatrix<Real, OrderB> B(512, 8);
+    DenseMat refA(8, 512);
+    DenseMat refB(512, 8);
+
+    initSparse<Real>(0.1, refA, A);
+    initSparse<Real>(0.1, refB, B);
+
+    SparseMatrix<Real, OrderC, std::int8_t> result;
+    SparseMatrix<Real, OrderC> result_large;
+    DenseMat refResult;
+
+    VERIFY_IS_APPROX( result = (A * B), refResult = refA * refB );
+  }
+
+  // Case: Small input but large result
+  {
+    SparseMatrix<Real, OrderA, std::int8_t> A(127, 8);
+    SparseMatrix<Real, OrderB, std::int8_t> B(8, 127);
+    DenseMat refA(127, 8);
+    DenseMat refB(8, 127);
+
+    initSparse<Real>(0.01, refA, A);
+    initSparse<Real>(0.01, refB, B);
+
+    SparseMatrix<Real, OrderC> result;
+    SparseMatrix<Real, OrderC> result_large;
+    DenseMat refResult;
+
+    VERIFY_IS_APPROX( result = (A * B), refResult = refA * refB );
+  }
+}
+
+void test_mixed_storage() {
+  test_mixed_storage_imp<RowMajor, RowMajor, RowMajor>();
+  test_mixed_storage_imp<RowMajor, RowMajor, ColMajor>();
+  test_mixed_storage_imp<RowMajor, ColMajor, RowMajor>();
+  test_mixed_storage_imp<RowMajor, ColMajor, ColMajor>();
+  test_mixed_storage_imp<ColMajor, RowMajor, RowMajor>();
+  test_mixed_storage_imp<ColMajor, RowMajor, ColMajor>();
+  test_mixed_storage_imp<ColMajor, ColMajor, RowMajor>();
+  test_mixed_storage_imp<ColMajor, ColMajor, ColMajor>();
+}
+
 EIGEN_DECLARE_TEST(sparse_product)
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -473,5 +525,6 @@
     CALL_SUBTEST_4( (sparse_product_regression_test<SparseMatrix<double,RowMajor>, Matrix<double, Dynamic, Dynamic, RowMajor> >()) );
 
     CALL_SUBTEST_5( (test_mixing_types<float>()) );
+    CALL_SUBTEST_5( (test_mixed_storage()) );
   }
 }
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index 5892794..2b3b403 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -88,7 +88,7 @@
     
     x.setZero();
     // test with Map
-    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    Map<SparseMatrix<Scalar,Mat::Options,StorageIndex>> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
     solver.compute(Am);
     VERIFY(solver.info() == Success && "factorization failed when using Map");
     DenseRhs dx(refX);
@@ -217,7 +217,7 @@
 
     x1.setZero();
     // test with Map
-    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    Map<SparseMatrix<Scalar,Mat::Options,StorageIndex> > Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
     solver.compute(Am);
     VERIFY(solver.info() == Success && "factorization failed when using Map");
     DenseRhs dx(refX1);
diff --git a/test/sparse_solvers.cpp b/test/sparse_solvers.cpp
index 3b7cd77..1f4948a 100644
--- a/test/sparse_solvers.cpp
+++ b/test/sparse_solvers.cpp
@@ -62,7 +62,7 @@
     {
       SparseMatrix<Scalar> cm2(m2);
       //Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr
-      MappedSparseMatrix<Scalar> mm2(rows, cols, cm2.nonZeros(), cm2.outerIndexPtr(), cm2.innerIndexPtr(), cm2.valuePtr());
+      Map<SparseMatrix<Scalar> > mm2(rows, cols, cm2.nonZeros(), cm2.outerIndexPtr(), cm2.innerIndexPtr(), cm2.valuePtr());
       VERIFY_IS_APPROX(refMat2.conjugate().template triangularView<Upper>().solve(vec2),
                        mm2.conjugate().template triangularView<Upper>().solve(vec3));
     }
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index 3512927..7bd57cd 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -111,7 +111,7 @@
   // check copy to dense vector with transpose
   refV3.resize(0);
   VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense()); 
-  VERIFY_IS_APPROX(DenseVector(v1),v1.toDense()); 
+  VERIFY_IS_APPROX(DenseVector(v1),v1.toDense());
 
   // test conservative resize
   {
@@ -144,6 +144,31 @@
   }
 
 }
+void test_pruning() {
+    using SparseVectorType = SparseVector<double, 0, int>;
+
+    SparseVectorType vec;
+    auto init_vec = [&](){
+        vec.resize(10);
+        vec.insert(3) = 0.1;
+        vec.insert(5) = 1.0;
+        vec.insert(8) = -0.1;
+        vec.insert(9) = -0.2;
+    };
+    init_vec();
+
+    VERIFY_IS_EQUAL(vec.nonZeros(), 4);
+    VERIFY_IS_EQUAL(vec.prune(0.1, 1.0), 2);
+    VERIFY_IS_EQUAL(vec.nonZeros(), 2);
+    VERIFY_IS_EQUAL(vec.coeff(5), 1.0);
+    VERIFY_IS_EQUAL(vec.coeff(9), -0.2);
+
+    init_vec();
+    VERIFY_IS_EQUAL(vec.prune([](double v) { return v >= 0; }), 2);
+    VERIFY_IS_EQUAL(vec.nonZeros(), 2);
+    VERIFY_IS_EQUAL(vec.coeff(3), 0.1);
+    VERIFY_IS_EQUAL(vec.coeff(5), 1.0);
+}
 
 EIGEN_DECLARE_TEST(sparse_vector)
 {
@@ -159,5 +184,7 @@
     CALL_SUBTEST_1(( sparse_vector<double,long int>(r, c) ));
     CALL_SUBTEST_1(( sparse_vector<double,short>(r, c) ));
   }
+
+  CALL_SUBTEST_1(test_pruning());
 }
 
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 1b8b33f..c269289 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -38,6 +38,8 @@
 #include <cmath>
 #include <cstddef>
 #include <cstring>
+#include <iterator>
+#include <numeric>
 #include <random>
 #include <thread>
 
@@ -76,6 +78,8 @@
 #include "src/Tensor/TensorIntDiv.h"
 #include "src/Tensor/TensorGlobalFunctions.h"
 
+#include "src/Tensor/TensorIO.h"
+
 #include "src/Tensor/TensorBase.h"
 #include "src/Tensor/TensorBlock.h"
 
@@ -129,7 +133,7 @@
 #include "src/Tensor/TensorMap.h"
 #include "src/Tensor/TensorRef.h"
 
-#include "src/Tensor/TensorIO.h"
+
 
 #include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index d4d3d59..b6abf2f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1794,6 +1794,45 @@
 
 TODO
 
+## Tensor Printing
+Tensors can be printed into a stream object (e.g. `std::cout`) using different formatting options.
+
+	Eigen::Tensor<float, 3> tensor3d = {4, 3, 2};
+	tensor3d.setValues( {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}, {{13, 14}, {15, 16}, {17, 18}}, {{19, 20}, {21, 22}, {23, 24}}} );
+	std::cout << tensor3d.format(Eigen::TensorIOFormat::Plain()) << std::endl;
+	==>
+	 1  2 
+	 3  4 
+	 5  6 
+	
+	 7  8 
+	 9 10
+	11 12
+	
+	13 14
+	15 16
+	17 18
+	
+	19 20
+	21 22
+	23 24
+
+
+In the example, we used the predefined format `Eigen::TensorIOFormat::Plain`.
+Here is the list of all predefined formats from which you can choose:
+- `Eigen::TensorIOFormat::Plain()` for a plain output without braces. Different submatrices are separated by a blank line.
+- `Eigen::TensorIOFormat::Numpy()` for NumPy-like output.
+- `Eigen::TensorIOFormat::Native()` for C++-like output that can be copy-pasted directly into `setValues()`.
+- `Eigen::TensorIOFormat::Legacy()` for backwards-compatible printing of tensors.
+
+If you send the tensor directly to the stream, the default format `Eigen::TensorIOFormat::Plain()` is used.
+
+You can define your own format by explicitly providing an `Eigen::TensorIOFormat` instance (see the sketch after this list). Here, you can specify:
+- The overall prefix and suffix with `std::string tenPrefix` and `std::string tenSuffix`
+- The prefix, separator and suffix for each new element, row, matrix, 3d subtensor, ... with `std::vector<std::string> prefix`, `std::vector<std::string> separator` and `std::vector<std::string> suffix`. Note that the first entry in each of these vectors refers to the last dimension of the tensor, e.g. `separator[0]` is printed between adjacent elements, `separator[1]` between adjacent rows, and so on.
+- `char fill`: the fill character used for padding when columns are aligned.
+- `int precision`
+- `int flags`: an OR-ed combination of flags; the default value is 0. The only currently available flag is `Eigen::DontAlignCols`, which disables column alignment and results in faster printing.
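+
+For illustration, a custom format might be built like this (a minimal sketch; the name `my_fmt` and the chosen delimiters are arbitrary, and the separator entries are interpreted from the last dimension outwards):
+
+	Eigen::TensorIOFormat my_fmt(/*separator=*/{", ", "; "},
+	                             /*prefix=*/{"", "("}, /*suffix=*/{"", ")"},
+	                             /*precision=*/3, /*flags=*/0,
+	                             /*tenPrefix=*/"<", /*tenSuffix=*/">");
+	std::cout << tensor3d.format(my_fmt) << std::endl;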
 
 ## Representation of scalar values
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 76e97cd..26358d5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -76,7 +76,7 @@
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
     enum {
-      IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
+      IsAligned = (EIGEN_MAX_ALIGN_BYTES>0) && !(Options_&DontAlign),
       Layout = Options_ & RowMajor ? RowMajor : ColMajor,
       CoordAccess = true,
       RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 68aced5..9c356f4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -962,6 +962,11 @@
       return TensorForcedEvalOp<const Derived>(derived());
     }
 
+    // Returns a formatted tensor ready for printing to a stream
+    inline const TensorWithFormat<Derived,DerivedTraits::Layout,DerivedTraits::NumDimensions> format(const TensorIOFormat& fmt) const {
+      return TensorWithFormat<Derived,DerivedTraits::Layout,DerivedTraits::NumDimensions>(derived(), fmt);
+    }
+
     #ifdef EIGEN_READONLY_TENSORBASE_PLUGIN
     #include EIGEN_READONLY_TENSORBASE_PLUGIN
     #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index bd53e60..ab6dcb6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -463,8 +463,8 @@
           values[i] = m_impl.coeff(inputIndex);
           ++outputOffset;
         } else {
-          outputOffset = 1;
           values[i] = m_impl.coeff(++inputIndex);
+          outputOffset = 1;  // Next offset.
         }
       }
       return internal::pload<PacketReturnType>(values);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 279be34..9a513dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -553,11 +553,59 @@
 };
 
 #if defined(EIGEN_GPUCC)
+// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
+                                                            Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  if (lhs > 0 && rhs > 0) {
+    return lhs > highest - rhs ? 1 : 0;
+  } else if (lhs < 0 && rhs < 0) {
+    return lhs < lowest - rhs ? -1 : 0;
+  } else {
+    return 0;
+  }
+}
+
+// Returns lhs + rhs, saturating to the highest/lowest representable value on
+// overflow/underflow respectively.
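+// For example, with Index = int8_t, saturate_add(Index(120), Index(20)) returns 127 instead of wrapping.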
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  int overflow = sum_will_overflow(lhs, rhs);
+  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
+}
+
+// A functor that adds step_size to a given index, saturating to avoid
+// overflow/underflow. If overflow/underflow is not possible, regular addition
+// is used (for efficiency).
+template <typename Index>
+struct SafeStep {
+  // lastIdx is one past the end of the possible indexes.
+  // step_size is the value that will be added to the given index when the
+  // functor is called.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
+      : can_overflow_(sum_will_overflow(lastIdx, step_size)),
+        step_size_(step_size) {}
+
+  // Adds step_size to index, saturating on overflow (if overflow is possible).
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
+    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
+  }
+
+ private:
+  const bool can_overflow_;
+  const Index step_size_;
+};
+
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
       eval.evalScalar(i);
     }
   }
@@ -571,12 +619,16 @@
     const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
     const StorageIndex vectorized_step_size = step_size * PacketSize;
 
+    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
+                                                vectorized_step_size);
     // Use the vector path
     for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
-         i += vectorized_step_size) {
+         i = safe_vectorized_step(i)) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
+         i = safe_step(i)) {
       eval.evalScalar(i);
     }
   }
@@ -603,8 +655,11 @@
   if (needs_assign) {
 
     const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int max_blocks =
+        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
+                              device.maxGpuThreadsPerMultiProcessor(),
+                          NumTraits<StorageIndex>::highest()) /
+        block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index f47973b..2e65586 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -14,68 +14,361 @@
 
 namespace Eigen {
 
+struct TensorIOFormat;
+
 namespace internal {
-
-// Print the tensor as a 2d matrix
-template <typename Tensor, int Rank>
-struct TensorPrinter {
-  static void run (std::ostream& os, const Tensor& tensor) {
-    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
-    typedef typename Tensor::Index Index;
-    const Index total_size = internal::array_prod(tensor.dimensions());
-    if (total_size > 0) {
-      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
-      static const int layout = Tensor::Layout;
-      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
-      os << matrix;
-    }
-  }
-};
-
-
-// Print the tensor as a vector
-template <typename Tensor>
-struct TensorPrinter<Tensor, 1> {
-  static void run (std::ostream& os, const Tensor& tensor) {
-    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
-    typedef typename Tensor::Index Index;
-    const Index total_size = internal::array_prod(tensor.dimensions());
-    if (total_size > 0) {
-      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
-      os << array;
-    }
-  }
-};
-
-
-// Print the tensor as a scalar
-template <typename Tensor>
-struct TensorPrinter<Tensor, 0> {
-  static void run (std::ostream& os, const Tensor& tensor) {
-    os << tensor.coeff(0);
-  }
-};
+template <typename Tensor, std::size_t rank>
+struct TensorPrinter;
 }
 
+struct TensorIOFormat {
+  TensorIOFormat(const std::vector<std::string>& _separator, const std::vector<std::string>& _prefix,
+                 const std::vector<std::string>& _suffix, int _precision = StreamPrecision, int _flags = 0,
+                 const std::string& _tenPrefix = "", const std::string& _tenSuffix = "", const char _fill = ' ')
+      : tenPrefix(_tenPrefix),
+        tenSuffix(_tenSuffix),
+        prefix(_prefix),
+        suffix(_suffix),
+        separator(_separator),
+        fill(_fill),
+        precision(_precision),
+        flags(_flags) {
+    init_spacer();
+  }
+
+  TensorIOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _tenPrefix = "",
+                 const std::string& _tenSuffix = "", const char _fill = ' ')
+      : tenPrefix(_tenPrefix), tenSuffix(_tenSuffix), fill(_fill), precision(_precision), flags(_flags) {
+    // default values of prefix, suffix and separator
+    prefix = {"", "["};
+    suffix = {"", "]"};
+    separator = {", ", "\n"};
+
+    init_spacer();
+  }
+
+  void init_spacer() {
+    if ((flags & DontAlignCols)) return;
+    spacer.resize(prefix.size());
+    spacer[0] = "";
+    int i = int(tenPrefix.length()) - 1;
+    while (i >= 0 && tenPrefix[i] != '\n') {
+      spacer[0] += ' ';
+      i--;
+    }
+
+    for (std::size_t k = 1; k < prefix.size(); k++) {
+      int j = int(prefix[k].length()) - 1;
+      while (j >= 0 && prefix[k][j] != '\n') {
+        spacer[k] += ' ';
+        j--;
+      }
+    }
+  }
+
+  static inline const TensorIOFormat Numpy() {
+    std::vector<std::string> prefix = {"", "["};
+    std::vector<std::string> suffix = {"", "]"};
+    std::vector<std::string> separator = {" ", "\n"};
+    return TensorIOFormat(separator, prefix, suffix, StreamPrecision, 0, "[", "]");
+  }
+
+  static inline const TensorIOFormat Plain() {
+    std::vector<std::string> separator = {" ", "\n", "\n", ""};
+    std::vector<std::string> prefix = {""};
+    std::vector<std::string> suffix = {""};
+    return TensorIOFormat(separator, prefix, suffix, StreamPrecision, 0, "", "", ' ');
+  }
+
+  static inline const TensorIOFormat Native() {
+    std::vector<std::string> separator = {", ", ",\n", "\n"};
+    std::vector<std::string> prefix = {"", "{"};
+    std::vector<std::string> suffix = {"", "}"};
+    return TensorIOFormat(separator, prefix, suffix, StreamPrecision, 0, "{", "}", ' ');
+  }
+
+  static inline const TensorIOFormat Legacy() {
+    TensorIOFormat LegacyFormat(StreamPrecision, 0, "", "", ' ');
+    LegacyFormat.legacy_bit = true;
+    return LegacyFormat;
+  }
+
+  std::string tenPrefix;
+  std::string tenSuffix;
+  std::vector<std::string> prefix;
+  std::vector<std::string> suffix;
+  std::vector<std::string> separator;
+  char fill;
+  int precision;
+  int flags;
+  std::vector<std::string> spacer{};
+  bool legacy_bit = false;
+};
+
+template <typename T, int Layout, int rank>
+class TensorWithFormat;
+// specialize for Layout=ColMajor, Layout=RowMajor and rank=0.
+template <typename T, int rank>
+class TensorWithFormat<T, RowMajor, rank> {
+ public:
+  TensorWithFormat(const T& tensor, const TensorIOFormat& format) : t_tensor(tensor), t_format(format) {}
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat<T, RowMajor, rank>& wf) {
+    // Evaluate the expression if needed
+    typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+    TensorForcedEvalOp<const T> eval = wf.t_tensor.eval();
+    Evaluator tensor(eval, DefaultDevice());
+    tensor.evalSubExprsIfNeeded(NULL);
+    internal::TensorPrinter<Evaluator, rank>::run(os, tensor, wf.t_format);
+    // Cleanup.
+    tensor.cleanup();
+    return os;
+  }
+
+ protected:
+  T t_tensor;
+  TensorIOFormat t_format;
+};
+
+template <typename T, int rank>
+class TensorWithFormat<T, ColMajor, rank> {
+ public:
+  TensorWithFormat(const T& tensor, const TensorIOFormat& format) : t_tensor(tensor), t_format(format) {}
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat<T, ColMajor, rank>& wf) {
+    // Switch to RowMajor storage and print afterwards
+    typedef typename T::Index IndexType;
+    std::array<IndexType, rank> shuffle;
+    std::array<IndexType, rank> id;
+    std::iota(id.begin(), id.end(), IndexType(0));
+    std::copy(id.begin(), id.end(), shuffle.rbegin());
+    auto tensor_row_major = wf.t_tensor.swap_layout().shuffle(shuffle);
+
+    // Evaluate the expression if needed
+    typedef TensorEvaluator<const TensorForcedEvalOp<const decltype(tensor_row_major)>, DefaultDevice> Evaluator;
+    TensorForcedEvalOp<const decltype(tensor_row_major)> eval = tensor_row_major.eval();
+    Evaluator tensor(eval, DefaultDevice());
+    tensor.evalSubExprsIfNeeded(NULL);
+    internal::TensorPrinter<Evaluator, rank>::run(os, tensor, wf.t_format);
+    // Cleanup.
+    tensor.cleanup();
+    return os;
+  }
+
+ protected:
+  T t_tensor;
+  TensorIOFormat t_format;
+};
+
 template <typename T>
-std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
-  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
-  typedef typename Evaluator::Dimensions Dimensions;
+class TensorWithFormat<T, ColMajor, 0> {
+ public:
+  TensorWithFormat(const T& tensor, const TensorIOFormat& format) : t_tensor(tensor), t_format(format) {}
 
-  // Evaluate the expression if needed
-  TensorForcedEvalOp<const T> eval = expr.eval();
-  Evaluator tensor(eval, DefaultDevice());
-  tensor.evalSubExprsIfNeeded(NULL);
+  friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat<T, ColMajor, 0>& wf) {
+    // Evaluate the expression if needed
+    typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+    TensorForcedEvalOp<const T> eval = wf.t_tensor.eval();
+    Evaluator tensor(eval, DefaultDevice());
+    tensor.evalSubExprsIfNeeded(NULL);
+    internal::TensorPrinter<Evaluator, 0>::run(os, tensor, wf.t_format);
+    // Cleanup.
+    tensor.cleanup();
+    return os;
+  }
 
-  // Print the result
-  static const int rank = internal::array_size<Dimensions>::value;
-  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
+ protected:
+  T t_tensor;
+  TensorIOFormat t_format;
+};
 
-  // Cleanup.
-  tensor.cleanup();
-  return os;
+namespace internal {
+template <typename Tensor, std::size_t rank>
+struct TensorPrinter {
+  static void run(std::ostream& s, const Tensor& _t, const TensorIOFormat& fmt) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index IndexType;
+    static const int layout = Tensor::Layout;
+    // backwards compatibility case: print tensor after reshaping to matrix of size dim(0) x
+    // (dim(1)*dim(2)*...*dim(rank-1)).
+    if (fmt.legacy_bit) {
+      const IndexType total_size = internal::array_prod(_t.dimensions());
+      if (total_size > 0) {
+        const IndexType first_dim = Eigen::internal::array_get<0>(_t.dimensions());
+        Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(_t.data(), first_dim,
+                                                                   total_size / first_dim);
+        s << matrix;
+        return;
+      }
+    }
+
+    assert(layout == RowMajor);
+    typedef typename conditional<is_same<Scalar, char>::value || is_same<Scalar, unsigned char>::value ||
+                                     is_same<Scalar, numext::int8_t>::value || is_same<Scalar, numext::uint8_t>::value,
+                                 int,
+                                 typename conditional<is_same<Scalar, std::complex<char> >::value ||
+                                                          is_same<Scalar, std::complex<unsigned char> >::value ||
+                                                          is_same<Scalar, std::complex<numext::int8_t> >::value ||
+                                                          is_same<Scalar, std::complex<numext::uint8_t> >::value,
+                                                      std::complex<int>, const Scalar&>::type>::type PrintType;
+
+    const IndexType total_size = array_prod(_t.dimensions());
+
+    std::streamsize explicit_precision;
+    if (fmt.precision == StreamPrecision) {
+      explicit_precision = 0;
+    } else if (fmt.precision == FullPrecision) {
+      if (NumTraits<Scalar>::IsInteger) {
+        explicit_precision = 0;
+      } else {
+        explicit_precision = significant_decimals_impl<Scalar>::run();
+      }
+    } else {
+      explicit_precision = fmt.precision;
+    }
+
+    std::streamsize old_precision = 0;
+    if (explicit_precision) old_precision = s.precision(explicit_precision);
+
+    IndexType width = 0;
+
+    bool align_cols = !(fmt.flags & DontAlignCols);
+    if (align_cols) {
+      // compute the largest width
+      for (IndexType i = 0; i < total_size; i++) {
+        std::stringstream sstr;
+        sstr.copyfmt(s);
+        sstr << static_cast<PrintType>(_t.data()[i]);
+        width = std::max<IndexType>(width, IndexType(sstr.str().length()));
+      }
+    }
+    std::streamsize old_width = s.width();
+    char old_fill_character = s.fill();
+
+    s << fmt.tenPrefix;
+    for (IndexType i = 0; i < total_size; i++) {
+      std::array<bool, rank> is_at_end{};
+      std::array<bool, rank> is_at_begin{};
+
+      // is the ith element the end of a coeff (always true), of a row, of a matrix, ...?
+      for (std::size_t k = 0; k < rank; k++) {
+        if ((i + 1) % (std::accumulate(_t.dimensions().rbegin(), _t.dimensions().rbegin() + k, 1,
+                                       std::multiplies<IndexType>())) ==
+            0) {
+          is_at_end[k] = true;
+        }
+      }
+
+      // is the ith element the beginning of a coeff (always true), of a row, of a matrix, ...?
+      for (std::size_t k = 0; k < rank; k++) {
+        if (i % (std::accumulate(_t.dimensions().rbegin(), _t.dimensions().rbegin() + k, 1,
+                                 std::multiplies<IndexType>())) ==
+            0) {
+          is_at_begin[k] = true;
+        }
+      }
+
+      // do we have a line break?
+      bool is_at_begin_after_newline = false;
+      for (std::size_t k = 0; k < rank; k++) {
+        if (is_at_begin[k]) {
+          std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1;
+          if (fmt.separator[separator_index].find('\n') != std::string::npos) {
+            is_at_begin_after_newline = true;
+          }
+        }
+      }
+
+      bool is_at_end_before_newline = false;
+      for (std::size_t k = 0; k < rank; k++) {
+        if (is_at_end[k]) {
+          std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1;
+          if (fmt.separator[separator_index].find('\n') != std::string::npos) {
+            is_at_end_before_newline = true;
+          }
+        }
+      }
+
+      std::stringstream suffix, prefix, separator;
+      for (std::size_t k = 0; k < rank; k++) {
+        std::size_t suffix_index = (k < fmt.suffix.size()) ? k : fmt.suffix.size() - 1;
+        if (is_at_end[k]) {
+          suffix << fmt.suffix[suffix_index];
+        }
+      }
+      for (std::size_t k = 0; k < rank; k++) {
+        std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1;
+        if (is_at_end[k] &&
+            (!is_at_end_before_newline || fmt.separator[separator_index].find('\n') != std::string::npos)) {
+          separator << fmt.separator[separator_index];
+        }
+      }
+      for (std::size_t k = 0; k < rank; k++) {
+        std::size_t spacer_index = (k < fmt.spacer.size()) ? k : fmt.spacer.size() - 1;
+        if (i != 0 && is_at_begin_after_newline && (!is_at_begin[k] || k == 0)) {
+          prefix << fmt.spacer[spacer_index];
+        }
+      }
+      for (int k = rank - 1; k >= 0; k--) {
+        std::size_t prefix_index = (static_cast<std::size_t>(k) < fmt.prefix.size()) ? k : fmt.prefix.size() - 1;
+        if (is_at_begin[k]) {
+          prefix << fmt.prefix[prefix_index];
+        }
+      }
+
+      s << prefix.str();
+      if (width) {
+        s.fill(fmt.fill);
+        s.width(width);
+        s << std::right;
+      }
+      s << _t.data()[i];
+      s << suffix.str();
+      if (i < total_size - 1) {
+        s << separator.str();
+      }
+    }
+    s << fmt.tenSuffix;
+    if (explicit_precision) s.precision(old_precision);
+    if (width) {
+      s.fill(old_fill_character);
+      s.width(old_width);
+    }
+  }
+};
+
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run(std::ostream& s, const Tensor& _t, const TensorIOFormat& fmt) {
+    typedef typename Tensor::Scalar Scalar;
+
+    std::streamsize explicit_precision;
+    if (fmt.precision == StreamPrecision) {
+      explicit_precision = 0;
+    } else if (fmt.precision == FullPrecision) {
+      if (NumTraits<Scalar>::IsInteger) {
+        explicit_precision = 0;
+      } else {
+        explicit_precision = significant_decimals_impl<Scalar>::run();
+      }
+    } else {
+      explicit_precision = fmt.precision;
+    }
+
+    std::streamsize old_precision = 0;
+    if (explicit_precision) old_precision = s.precision(explicit_precision);
+
+    s << fmt.tenPrefix << _t.coeff(0) << fmt.tenSuffix;
+    if (explicit_precision) s.precision(old_precision);
+  }
+};
+
+}  // end namespace internal
+template <typename T>
+std::ostream& operator<<(std::ostream& s, const TensorBase<T, ReadOnlyAccessors>& t) {
+  s << t.format(TensorIOFormat::Plain());
+  return s;
 }
+}  // end namespace Eigen
 
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b3f4a1c..cf891eb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -30,13 +30,15 @@
 template <typename T, typename X, typename Y>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const X x, const Y y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
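+  // For example, when x == numeric_limits<X>::max() the old form (x + y - 1) would overflow,
+  // while (x - 1) / y + 1 stays in range and still yields the ceiling of x / y.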
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <size_t n> struct max_n_1 {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index b592247..2939b98 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -633,7 +633,7 @@
           m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       } else {
-        m_outputStrides[NumOutputDims - 1] = 1;
+        m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
         for (int i = NumOutputDims - 2; i >= 0; --i) {
           m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
           m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
@@ -680,7 +680,7 @@
             ? internal::array_prod(input_dims)
             : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                   ? m_preservedStrides[0]
-                  : m_preservedStrides[NumOutputDims - 1];
+                  : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 149ceaf..f662dee 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -81,7 +81,8 @@
 template<>                                  struct take<0, type_list<>>         { typedef type_list<> type; };
 
 template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
-template<typename T, int n>               struct take<n, numeric_list<T>>           { typedef numeric_list<T> type; };
+// XXX The following breaks in gcc-11, and is invalid anyways.
+// template<typename T, int n>               struct take<n, numeric_list<T>>           { typedef numeric_list<T> type; };
 template<typename T, T a, T... as>        struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
 template<typename T>                      struct take<0, numeric_list<T>>           { typedef numeric_list<T> type; };
 
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index e510010..0265136 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -17,12 +17,39 @@
 
 namespace Eigen {
 template <typename T, size_t n> class array {
+
  public:
   typedef T value_type;
   typedef T* iterator;
   typedef const T* const_iterator;
 
   EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE iterator begin() { return values; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_iterator begin() const { return values; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE iterator end() { return values + n; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_iterator end() const { return values + n; }
+
+
+#if !defined(EIGEN_GPUCC)
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE reverse_iterator rbegin() { return reverse_iterator(end());}
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE reverse_iterator rend() { return reverse_iterator(begin()); }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
+#endif
+
+  EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
@@ -42,16 +69,6 @@
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE iterator begin() { return values; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const_iterator begin() const { return values; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE iterator end() { return values + n; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const_iterator end() const { return values + n; }
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   static std::size_t size() { return n; }
 
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0a37c02..83b150d 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -66,6 +66,47 @@
   gpuFree(d_in2);
 }
 
+// Tests that there are no indexing overflows when computing tensors with the
+// max representable size.
+template <typename IndexType,
+          IndexType N = (std::numeric_limits<IndexType>::max)()>
+void test_gpu_nullary_max_size()
+{
+  typedef int8_t DataType;
+  typedef Tensor<DataType, 1, 0, IndexType> TensorType;
+  typedef Eigen::array<IndexType, 1> ArrayType;
+
+  const IndexType n = N;
+  TensorType in1((ArrayType(n)));
+  in1.setZero();
+
+  std::size_t in1_bytes = in1.size() * sizeof(DataType);
+
+  DataType* d_in1;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(123);
+
+  TensorType new1((ArrayType(n)));
+
+  assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
+                        gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (IndexType i = 0; i < n; ++i) {
+    VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
+  }
+
+  gpuFree(d_in1);
+}
+
 void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1524,6 +1565,10 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 {
   CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
+  CALL_SUBTEST_1((test_gpu_nullary_max_size<
+                  int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
   CALL_SUBTEST_1(test_gpu_elementwise_small());
   CALL_SUBTEST_1(test_gpu_elementwise());
   CALL_SUBTEST_1(test_gpu_props());
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 2c638f9..c0ea2a1 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -6,131 +6,137 @@
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
 #include "main.h"
+
 #include <sstream>
-#include <string>
 #include <Eigen/CXX11/Tensor>
 
+template <typename Scalar, int rank, int Layout>
+struct test_tensor_ostream_impl {};
 
-template<int DataLayout>
-static void test_output_0d()
-{
-  Tensor<int, 0, DataLayout> tensor;
-  tensor() = 123;
-
-  std::stringstream os;
-  os << tensor;
-
-  std::string expected("123");
-  VERIFY_IS_EQUAL(std::string(os.str()), expected);
-}
-
-
-template<int DataLayout>
-static void test_output_1d()
-{
-  Tensor<int, 1, DataLayout> tensor(5);
-  for (int i = 0; i < 5; ++i) {
-    tensor(i) = i;
+template<typename Scalar, int Layout>
+struct test_tensor_ostream_impl<Scalar, 0, Layout> {
+  static void run() {
+    Eigen::Tensor<Scalar, 0> t;
+    t.setValues(1);
+    std::ostringstream os;
+    os << t.format(Eigen::TensorIOFormat::Plain());
+    VERIFY(os.str() == "1");
   }
+};
 
-  std::stringstream os;
-  os << tensor;
-
-  std::string expected("0\n1\n2\n3\n4");
-  VERIFY_IS_EQUAL(std::string(os.str()), expected);
-
-  Eigen::Tensor<double,1,DataLayout> empty_tensor(0);
-  std::stringstream empty_os;
-  empty_os << empty_tensor;
-  std::string empty_string;
-  VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string);
-}
-
-
-template<int DataLayout>
-static void test_output_2d()
-{
-  Tensor<int, 2, DataLayout> tensor(5, 3);
-  for (int i = 0; i < 5; ++i) {
-    for (int j = 0; j < 3; ++j) {
-      tensor(i, j) = i*j;
-    }
+template<typename Scalar, int Layout>
+struct test_tensor_ostream_impl<Scalar, 1, Layout> {
+  static void run() {
+    Eigen::Tensor<Scalar, 1> t = {3};
+    t.setValues({1, 2, 3});
+    std::ostringstream os;
+    os << t.format(Eigen::TensorIOFormat::Plain());
+    VERIFY(os.str() == "1 2 3");
   }
+};
 
-  std::stringstream os;
-  os << tensor;
-
-  std::string expected("0  0  0\n0  1  2\n0  2  4\n0  3  6\n0  4  8");
-  VERIFY_IS_EQUAL(std::string(os.str()), expected);
-}
-
-
-template<int DataLayout>
-static void test_output_expr()
-{
-  Tensor<int, 1, DataLayout> tensor1(5);
-  Tensor<int, 1, DataLayout> tensor2(5);
-  for (int i = 0; i < 5; ++i) {
-    tensor1(i) = i;
-    tensor2(i) = 7;
+template<typename Scalar, int Layout>
+struct test_tensor_ostream_impl<Scalar, 2, Layout> {
+  static void run() {
+    Eigen::Tensor<Scalar, 2> t = {3, 2};
+    t.setValues({{1, 2}, {3, 4}, {5, 6}});
+    std::ostringstream os;
+    os << t.format(Eigen::TensorIOFormat::Plain());
+    VERIFY(os.str() == "1 2\n3 4\n5 6");
   }
+};
 
-  std::stringstream os;
-  os << tensor1 + tensor2;
-
-  std::string expected(" 7\n 8\n 9\n10\n11");
-  VERIFY_IS_EQUAL(std::string(os.str()), expected);
-}
-
-
-template<int DataLayout>
-static void test_output_string()
-{
-  Tensor<std::string, 2, DataLayout> tensor(5, 3);
-  tensor.setConstant(std::string("foo"));
-
-  std::cout << tensor << std::endl;
-
-  std::stringstream os;
-  os << tensor;
-
-  std::string expected("foo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo");
-  VERIFY_IS_EQUAL(std::string(os.str()), expected);
-}
-
-
-template<int DataLayout>
-static void test_output_const()
-{
-  Tensor<int, 1, DataLayout> tensor(5);
-  for (int i = 0; i < 5; ++i) {
-    tensor(i) = i;
+template<typename Scalar, int Layout>
+struct test_tensor_ostream_impl<Scalar, 3, Layout> {
+  static void run() {
+    Eigen::Tensor<Scalar, 3> t = {4, 3, 2};
+    t.setValues({{{1, 2}, {3, 4}, {5, 6}},
+                 {{7, 8}, {9, 10}, {11, 12}},
+                 {{13, 14}, {15, 16}, {17, 18}},
+                 {{19, 20}, {21, 22}, {23, 24}}});
+    std::ostringstream os;
+    os << t.format(Eigen::TensorIOFormat::Plain());
+    VERIFY(os.str() == " 1  2\n 3  4\n 5  6\n\n 7  8\n 9 10\n11 12\n\n13 14\n15 16\n17 18\n\n19 20\n21 22\n23 24");
   }
+};
 
-  TensorMap<Tensor<const int, 1, DataLayout> > tensor_map(tensor.data(), 5);
+template<int Layout>
+struct test_tensor_ostream_impl<bool, 2, Layout> {
+  static void run() {
+    Eigen::Tensor<bool, 2> t = {3, 2};
+    t.setValues({{false, true}, {true, false}, {false, false}});
+    std::ostringstream os;
+    os << t.format(Eigen::TensorIOFormat::Plain());
+    VERIFY(os.str() == "0 1\n1 0\n0 0");
+  }
+};
 
-  std::stringstream os;
-  os << tensor_map;
+template<typename Scalar, int Layout>
+struct test_tensor_ostream_impl<std::complex<Scalar>, 2, Layout> {
+  static void run() {
+    Eigen::Tensor<std::complex<Scalar>, 2> t = {3, 2};
+    t.setValues({{std::complex<Scalar>(1, 2), std::complex<Scalar>(12, 3)},
+                 {std::complex<Scalar>(-4, 2), std::complex<Scalar>(0, 5)},
+                 {std::complex<Scalar>(-1, 4), std::complex<Scalar>(5, 27)}});
+    std::ostringstream os;
+    os << t.format(Eigen::TensorIOFormat::Plain());
+    VERIFY(os.str() == " (1,2) (12,3)\n(-4,2)  (0,5)\n(-1,4) (5,27)");
+  }
+};
 
-  std::string expected("0\n1\n2\n3\n4");
-  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+template <typename Scalar, int rank, int Layout>
+void test_tensor_ostream() {
+  test_tensor_ostream_impl<Scalar, rank, Layout>::run();
 }
 
+void test_const_tensor_ostream() {
+  Eigen::Tensor<float, 0> t;
+  t.setValues(1);
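+  // Map the same data as a tensor of const elements to check that formatting also
+  // compiles and prints for read-only views.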
+  const Eigen::TensorMap<Eigen::Tensor<const float, 0, Eigen::RowMajor>, Eigen::Unaligned> t_const(
+      t.data(), Eigen::DSizes<Eigen::DenseIndex, 0>{});
+  std::ostringstream os;
+  os << t_const.format(Eigen::TensorIOFormat::Plain());
+  VERIFY(os.str() == "1");
+}
 
-EIGEN_DECLARE_TEST(cxx11_tensor_io)
-{
-  CALL_SUBTEST(test_output_0d<ColMajor>());
-  CALL_SUBTEST(test_output_0d<RowMajor>());
-  CALL_SUBTEST(test_output_1d<ColMajor>());
-  CALL_SUBTEST(test_output_1d<RowMajor>());
-  CALL_SUBTEST(test_output_2d<ColMajor>());
-  CALL_SUBTEST(test_output_2d<RowMajor>());
-  CALL_SUBTEST(test_output_expr<ColMajor>());
-  CALL_SUBTEST(test_output_expr<RowMajor>());
-  CALL_SUBTEST(test_output_string<ColMajor>());
-  CALL_SUBTEST(test_output_string<RowMajor>());
-  CALL_SUBTEST(test_output_const<ColMajor>());
-  CALL_SUBTEST(test_output_const<RowMajor>());
+EIGEN_DECLARE_TEST(cxx11_tensor_io) {
+  CALL_SUBTEST((test_tensor_ostream<float, 0, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<float, 1, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<float, 2, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<float, 3, Eigen::ColMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<double, 0, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<double, 1, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<double, 2, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<double, 3, Eigen::ColMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<int, 0, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<int, 1, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<int, 2, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<int, 3, Eigen::ColMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<float, 0, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<float, 1, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<float, 2, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<float, 3, Eigen::RowMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<double, 0, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<double, 1, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<double, 2, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<double, 3, Eigen::RowMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<int, 0, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<int, 1, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<int, 2, Eigen::RowMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<int, 3, Eigen::RowMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<bool, 2, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<bool, 2, Eigen::RowMajor>()));
+
+  CALL_SUBTEST((test_tensor_ostream<std::complex<double>, 2, Eigen::ColMajor>()));
+  CALL_SUBTEST((test_tensor_ostream<std::complex<float>, 2, Eigen::ColMajor>()));
+
+  // Test printing TensorMap with const elements.
+  CALL_SUBTEST((test_const_tensor_ostream()));
 }
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index b7611d7..9648b9c 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -501,10 +501,16 @@
 
     // Compute the reference value in double precision.
     double expected_sum = 0.0;
+    double abs_sum = 0.0;
     for (int i = 0; i < num_elements; ++i) {
       expected_sum += static_cast<double>(tensor(i));
+      abs_sum += static_cast<double>(numext::abs(tensor(i)));
     }
-    VERIFY_IS_APPROX(sum(), static_cast<ScalarType>(expected_sum));
+    // Test against a probabilistic forward error bound. In reality, the error is much smaller
+    // when we use tree summation.
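+    // Roughly: |computed_sum - exact_sum| <= sqrt(n) * eps * sum_i |x_i| holds with high
+    // probability for recursive summation, versus the deterministic O(n * eps) worst case.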
+    double err = Eigen::numext::abs(static_cast<double>(sum()) - expected_sum);
+    double tol = numext::sqrt(num_elements) * NumTraits<ScalarType>::epsilon() * static_cast<ScalarType>(abs_sum);
+    VERIFY_LE(err, tol);
   }
 }